Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -1,136 +1,164 @@
import contextlib
import gc
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from collections import OrderedDict
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock
import pytest
import ray
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
import vllm
from vllm.config import LoRAConfig
from vllm.distributed import destroy_model_parallel, initialize_model_parallel
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear)
from vllm.distributed import (
cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear,
)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models.interfaces import SupportsLoRA
from vllm.platforms import current_platform
def cleanup():
    """Tear down model-parallel/process-group state and free cached memory.

    Also runs the garbage collector, empties the CUDA cache and shuts Ray
    down, in that order.
    """
    destroy_model_parallel()
    try:
        torch.distributed.destroy_process_group()
    except AssertionError:
        # Deliberately ignored, exactly as before.
        # NOTE(review): presumably raised when no process group exists.
        pass
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Tell whether the global cleanup should run after the current test.

    Subdirectories may override this fixture to skip the global cleanup,
    which can give a ~10x speedup for non-GPU unit tests because they
    never need to initialize torch.
    """
    skip_marker = request.node.get_closest_marker("skip_global_cleanup")
    return not skip_marker
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    """Autouse fixture that runs the global cleanup after each test.

    Args:
        should_do_global_cleanup_after_test: When false, the teardown is
            skipped entirely.
    """
    yield
    if should_do_global_cleanup_after_test:
        cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture
def dist_init():
    """Set up a single-rank distributed environment for the test.

    Initializes the distributed environment with a file-based rendezvous
    (world size 1, rank 0) and a 1x1 model-parallel layout, then cleans
    everything up once the test has finished.
    """
    import os

    # mkstemp returns an open descriptor; only the path is needed, so close
    # the descriptor immediately instead of leaking it for the whole run.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)

    # "gloo" on CPU/TPU platforms, "nccl" everywhere else.
    backend = "nccl"
    if current_platform.is_cpu() or current_platform.is_tpu():
        backend = "gloo"

    init_distributed_environment(
        world_size=1,
        rank=0,
        distributed_init_method=f"file://{temp_file}",
        local_rank=0,
        backend=backend,
    )
    initialize_model_parallel(1, 1)
    yield
    cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture
def dist_init_torch_only():
    """Initialize a single-rank torch.distributed process group if needed.

    Does nothing when a process group is already initialized. No teardown
    is performed by this fixture.
    """
    if torch.distributed.is_initialized():
        return

    import os

    # "gloo" on CPU platforms, "nccl" otherwise.
    backend = "nccl"
    if current_platform.is_cpu():
        backend = "gloo"

    # mkstemp returns an open descriptor; only the path is needed, so close
    # the descriptor immediately instead of leaking it.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)

    torch.distributed.init_process_group(
        world_size=1, rank=0, init_method=f"file://{temp_file}", backend=backend
    )
class DummyLoRAModel(nn.Sequential, SupportsLoRA):
    """Sequential container that also inherits the SupportsLoRA interface."""
@pytest.fixture
def dummy_model() -> nn.Module:
    """Build a small DummyLoRAModel made of parallel linear layers.

    The model carries a mocked ``config``, maps ``lm_head`` as its only
    embedding module and reports an unpadded vocabulary size of 32000.
    """
    inner_block = nn.Sequential(
        OrderedDict(
            [
                ("dense1", ColumnParallelLinear(100, 10)),
                ("dense2", RowParallelLinear(10, 50)),
            ]
        )
    )
    model = DummyLoRAModel(
        OrderedDict(
            [
                ("dense1", ColumnParallelLinear(764, 100)),
                ("dense2", RowParallelLinear(100, 50)),
                ("layer1", inner_block),
                ("act2", nn.ReLU()),
                ("output", ColumnParallelLinear(50, 10)),
                ("outact", nn.Sigmoid()),
                # Special handling for lm_head & sampler
                ("lm_head", ParallelLMHead(512, 10)),
                ("logits_processor", LogitsProcessor(512)),
            ]
        )
    )
    model.config = MagicMock()
    model.embedding_modules = {"lm_head": "lm_head"}
    model.unpadded_vocab_size = 32000
    return model
@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
    """Build a small DummyLoRAModel containing a merged ``gate_up_proj``.

    Same layout as ``dummy_model`` except that the ``output`` layer is
    replaced by a MergedColumnParallelLinear named ``gate_up_proj``, and
    ``packed_modules_mapping`` declares it as the packing of ``gate_proj``
    and ``up_proj``.
    """
    inner_block = nn.Sequential(
        OrderedDict(
            [
                ("dense1", ColumnParallelLinear(100, 10)),
                ("dense2", RowParallelLinear(10, 50)),
            ]
        )
    )
    model = DummyLoRAModel(
        OrderedDict(
            [
                ("dense1", ColumnParallelLinear(764, 100)),
                ("dense2", RowParallelLinear(100, 50)),
                ("layer1", inner_block),
                ("act2", nn.ReLU()),
                ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
                ("outact", nn.Sigmoid()),
                # Special handling for lm_head & sampler
                ("lm_head", ParallelLMHead(512, 10)),
                ("logits_processor", LogitsProcessor(512)),
            ]
        )
    )
    model.config = MagicMock()
    model.packed_modules_mapping = {
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }
    model.embedding_modules = {"lm_head": "lm_head"}
    model.unpadded_vocab_size = 32000
    return model
@pytest.fixture(scope="session")
def sql_lora_files():
    """Session fixture: snapshot of the ``llama-2-7b-sql-lora-test`` repo."""
    repo_id = "yard1/llama-2-7b-sql-lora-test"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def mixtral_lora_files():
    """Session fixture: snapshot of the ``SangBinCho/mixtral-lora`` repo."""
    # Note: this module has incorrect adapter_config.json to test
    # https://github.com/vllm-project/vllm/pull/5909/files.
    return snapshot_download(repo_id="SangBinCho/mixtral-lora")
@pytest.fixture(scope="session")
@@ -149,31 +177,85 @@ def baichuan_zero_lora_files():
return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
@pytest.fixture(scope="session")
def baichuan_regex_lora_files():
    """Session fixture: snapshot of the ``baichuan-7b-lora-zero-regex`` repo."""
    repo_id = "jeeejeee/baichuan-7b-lora-zero-regex"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def ilama_lora_files():
    """Session fixture: snapshot of the ``ilama-text2sql-spider`` repo."""
    repo_id = "jeeejeee/ilama-text2sql-spider"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def minicpmv_lora_files():
    """Session fixture: snapshot of the ``minicpmv25-lora-pokemon`` repo."""
    repo_id = "jeeejeee/minicpmv25-lora-pokemon"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def qwen2vl_lora_files():
    """Session fixture: snapshot of the ``qwen2-vl-lora-pokemon`` repo."""
    repo_id = "jeeejeee/qwen2-vl-lora-pokemon"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def qwen25vl_base_huggingface_id():
    """Session fixture: id of the base model for the qwen25vl LoRA tests."""
    # used as a base model for testing with qwen25vl lora adapter
    base_model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
    return base_model_id
@pytest.fixture(scope="session")
def qwen25vl_lora_files():
    """Session fixture: snapshot of the ``qwen25-vl-lora-pokemon`` repo."""
    repo_id = "jeeejeee/qwen25-vl-lora-pokemon"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def tinyllama_lora_files():
    """Session fixture: snapshot of the ``tinyllama-colorist-lora`` repo."""
    repo_id = "jashing/tinyllama-colorist-lora"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture
def llama_2_7b_engine_extra_embeddings() -> nn.Module:
cleanup()
get_model_old = get_model
@pytest.fixture(scope="session")
def deepseekv2_lora_files():
    """Session fixture: snapshot of the ``DeepSeek-V2-Lite-Chat-All-LoRA`` repo."""
    repo_id = "wuchen01/DeepSeek-V2-Lite-Chat-All-LoRA"
    return snapshot_download(repo_id=repo_id)
def get_model_patched(*, model_config, device_config, **kwargs):
kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8)
return get_model_old(model_config=model_config,
device_config=device_config,
**kwargs)
with patch("vllm.worker.model_runner.get_model", get_model_patched):
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
yield engine.llm_engine
del engine
cleanup()
@pytest.fixture(scope="session")
def gptoss20b_lora_files():
    """Session fixture: snapshot of the gpt-oss-20b text2sql LoRA adapter repo."""
    repo_id = "jeeejeee/gpt-oss-20b-lora-adapter-text2sql"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def qwen3moe_lora_files():
    """Session fixture: snapshot of the ``qwen3-moe-text2sql-spider`` repo."""
    repo_id = "jeeejeee/qwen3-moe-text2sql-spider"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def olmoe_lora_files():
    """Session fixture: snapshot of the ``olmoe-instruct-text2sql-spider`` repo."""
    repo_id = "jeeejeee/olmoe-instruct-text2sql-spider"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def qwen3_lora_files():
    """Session fixture: snapshot of the ``charent/self_cognition_Alice`` repo."""
    repo_id = "charent/self_cognition_Alice"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def llama32_lora_huggingface_id():
    """Session fixture: repo id of the llama32 text2sql LoRA adapter."""
    # huggingface repo id is used to test lora runtime downloading.
    adapter_repo_id = "jeeejeee/llama32-3b-text2sql-spider"
    return adapter_repo_id
@pytest.fixture(scope="session")
def llama32_lora_files(llama32_lora_huggingface_id):
    """Session fixture: snapshot of the repo named by the id fixture."""
    repo_id = llama32_lora_huggingface_id
    return snapshot_download(repo_id=repo_id)
@pytest.fixture
def llama_2_7b_model_extra_embeddings(
llama_2_7b_engine_extra_embeddings) -> nn.Module:
yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
model_runner.model)
def reset_default_device():
"""
Some tests, such as `test_punica_ops.py`, explicitly set the
default device, which can affect subsequent tests. Adding this fixture
helps avoid this problem.
"""
original_device = torch.get_default_device()
yield
torch.set_default_device(original_device)

113
tests/lora/test_add_lora.py Normal file
View File

@@ -0,0 +1,113 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
import pytest
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.utils.async_utils import merge_async_iterators
# Base model the async engine is created with.
MODEL_PATH = "zai-org/chatglm3-6b"
# Passed to the engine as max_lora_rank.
LORA_RANK = 64
# Number of LoRA requests generated; kept a multiple of 3 because the test
# splits the requests into three equally sized parts.
DEFAULT_MAX_LORAS = 4 * 3
def get_lora_requests(lora_path) -> list[LoRARequest]:
    """Create DEFAULT_MAX_LORAS requests that all load ``lora_path``.

    Ids run from 1 to DEFAULT_MAX_LORAS inclusive, and each request is
    named after the decimal string of its id.
    """
    requests: list[LoRARequest] = []
    for lora_id in range(1, DEFAULT_MAX_LORAS + 1):
        requests.append(
            LoRARequest(
                lora_name=f"{lora_id}",
                lora_int_id=lora_id,
                lora_path=lora_path,
            )
        )
    return requests
async def requests_processing_time(llm, lora_requests: list[LoRARequest]) -> float:
    """Time how long the engine needs to serve one request per LoRA.

    A single-token greedy generation is submitted for every entry of
    ``lora_requests``; the merged output stream is then drained.

    Returns:
        Elapsed ``time.perf_counter()`` seconds, measured from just before
        the first submission until every stream is exhausted.
    """
    sampling_params = SamplingParams(
        n=1, temperature=0.0, top_p=1.0, ignore_eos=True, max_tokens=1
    )

    def _submit(request: LoRARequest):
        request_index = request.lora_int_id
        return llm.generate(
            prompt=TextPrompt(prompt=f"hello {request_index}", multi_modal_data=None),  # type: ignore
            sampling_params=sampling_params,
            lora_request=request,
            request_id=f"test{request_index}",
        )

    began = time.perf_counter()
    streams = [_submit(request) for request in lora_requests]

    # Only completion matters; the individual outputs are discarded.
    async for _index, _output in merge_async_iterators(*streams):
        pass

    return time.perf_counter() - began
@pytest.mark.asyncio
async def test_add_lora(chatglm3_lora_files):
    """
    The add_lora function is used to preload some LoRA adapters into the
    engine in anticipation of future requests using these adapters. To test
    this functionality, we use the async engine to process some requests - We
    do it twice, once with add_lora() preloading and once without.

    We measure the request processing time in both cases and expect the time
    to be lesser in the case with add_lora() calls.
    """
    lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)

    max_loras = len({lr.lora_int_id for lr in lora_requests})
    # Create engine in eager-mode. Due to high max_loras, the CI can
    # OOM during cuda-graph capture.
    engine_args = AsyncEngineArgs(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=max_loras,
        max_lora_rank=LORA_RANK,
        max_model_len=128,
        gpu_memory_utilization=0.8,  # avoid OOM
        trust_remote_code=True,
        enforce_eager=True,
    )

    # split lora_requests into 3 parts
    part_size = len(lora_requests) // 3
    dummy_run_requests = lora_requests[:part_size]
    warmup_run_requests = lora_requests[part_size : part_size * 2]
    cold_run_requests = lora_requests[part_size * 2 :]

    async with build_async_engine_client_from_engine_args(engine_args) as llm:
        # Dummy run - So any 1-time functionality like triton kernel compilation
        # is complete here.
        await requests_processing_time(llm, dummy_run_requests)

        # Run with warmup
        add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
        add_lora_results = await asyncio.gather(*add_lora_tasks)
        # Test that all add_lora calls are successful.
        assert all(add_lora_results)
        time_with_add_lora = await requests_processing_time(llm, warmup_run_requests)

        # Run without any warmup
        time_cold_start = await requests_processing_time(llm, cold_run_requests)

    print(f"time hot-start {time_with_add_lora} vs time cold-start {time_cold_start} ")

    # The measured values are separated from the explanatory sentence so the
    # failure message stays readable.
    assert time_with_add_lora < time_cold_start, (
        f"time_with_add_lora={time_with_add_lora}, "
        f"time_cold_start={time_cold_start}. "
        "The engine request processing time with LoRA pre-loading "
        "must be less than the version that does on-demand LoRA loading."
    )

View File

@@ -1,108 +0,0 @@
import pytest
import vllm
from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "baichuan-inc/Baichuan-7B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
def do_sample(llm, lora_path: str, lora_id: int) -> list[str]:
    """Generate SQL for three fixed questions and return the stripped texts.

    Args:
        llm: Engine whose ``generate`` method is called.
        lora_path: Location of the LoRA adapter.
        lora_id: Adapter id; a falsy value runs without any LoRA request.

    Returns:
        One stripped generated text per prompt, in the order of the outputs.
    """
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
        PROMPT_TEMPLATE.format(
            query="What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
        ),
        PROMPT_TEMPLATE.format(
            query="Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
        ),
    ]
    print(prompts)
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
    )
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts
def test_baichuan_lora(baichuan_lora_files):
    """Check the Baichuan LoRA output for adapter ids 1 and 2."""
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=64,
        trust_remote_code=True,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    expected_lora_output = [
        "SELECT count(*) FROM singer",
        "SELECT avg(age) , min(age) , max(age) FROM singer WHERE Country = 'France'",  # noqa: E501
        "SELECT name , country , age FROM singer ORDER BY age ASC",
    ]

    # Both ids load the same adapter files and must produce the same SQL.
    for lora_id in (1, 2):
        produced = do_sample(llm, baichuan_lora_files, lora_id=lora_id)
        for position, expected in enumerate(expected_lora_output):
            assert produced[position] == expected
@pytest.mark.skip("Requires multiple GPUs")
def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
    """Compare LoRA outputs across tensor-parallel sizes 1, 2 and 4."""
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < 4:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")

    def _sample_with_tp(tp_size: int, lora_id: int):
        # Build an engine, sample once, then drop it and clean up before
        # the next engine is created.
        engine = vllm.LLM(
            MODEL_PATH,
            enable_lora=True,
            max_num_seqs=16,
            max_loras=4,
            max_lora_rank=64,
            tensor_parallel_size=tp_size,
            trust_remote_code=True,
        )
        texts = do_sample(engine, baichuan_lora_files, lora_id=lora_id)
        del engine
        cleanup()
        return texts

    output_tp1 = _sample_with_tp(1, lora_id=1)

    output_tp2 = _sample_with_tp(2, lora_id=2)
    assert output_tp1 == output_tp2

    output_tp4 = _sample_with_tp(4, lora_id=2)
    assert output_tp1 == output_tp4

View File

@@ -1,57 +0,0 @@
import vllm
from vllm.lora.request import LoRARequest
MODEL_PATH = "THUDM/chatglm3-6b"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
def do_sample(llm, lora_path: str, lora_id: int) -> list[str]:
    """Generate SQL for three fixed questions and return the stripped texts.

    Args:
        llm: Engine whose ``generate`` method is called.
        lora_path: Location of the LoRA adapter.
        lora_id: Adapter id; a falsy value runs without any LoRA request.

    Returns:
        One stripped generated text per prompt, in the order of the outputs.
    """
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
        PROMPT_TEMPLATE.format(
            query="What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
        ),
        PROMPT_TEMPLATE.format(
            query="Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
        ),
    ]
    print(prompts)
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
    )
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts
def test_chatglm3_lora(chatglm3_lora_files):
    """Check the ChatGLM3 LoRA output for adapter ids 1 and 2."""
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=64,
        trust_remote_code=True,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    expected_lora_output = [
        "SELECT count(*) FROM singer",
        "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",  # noqa: E501
        "SELECT name , country , age FROM singer ORDER BY age",
    ]

    # Both ids load the same adapter files and must produce the same SQL.
    for lora_id in (1, 2):
        produced = do_sample(llm, chatglm3_lora_files, lora_id=lora_id)
        for position, expected in enumerate(expected_lora_output):
            assert produced[position] == expected

View File

@@ -0,0 +1,122 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import vllm
import vllm.config
from vllm.lora.request import LoRARequest
from ..utils import create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "zai-org/chatglm3-6b"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM singer",
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",
"SELECT name , country , age FROM singer ORDER BY age",
]
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    """Run three fixed text-to-SQL prompts and collect the stripped outputs.

    A LoRA request is attached only when ``lora_id`` is truthy. Every
    prompt/result pair is printed as it is collected.
    """
    questions = (
        "How many singers do we have?",
        "What is the average, minimum, and maximum "
        "age of all singers from France?",
        "Show name, country, age for all singers ordered "
        "by age from the oldest to the youngest.",
    )
    prompts = [PROMPT_TEMPLATE.format(query=question) for question in questions]

    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)

    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    generated_texts: list[str] = []
    for result in outputs:
        prompt = result.prompt
        text = result.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Prompt: {prompt!r}, Generated text: {text!r}")
    return generated_texts
@create_new_process_for_each_test()
def test_chatglm3_lora(chatglm3_lora_files):
    """Single-GPU check of the ChatGLM3 LoRA output for ids 1 and 2."""
    engine_kwargs = dict(
        max_model_len=512,
        enable_lora=True,
        max_loras=2,
        max_num_seqs=16,
        max_lora_rank=64,
        trust_remote_code=True,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for lora_id in (1, 2):
        produced = do_sample(llm, chatglm3_lora_files, lora_id=lora_id)
        for position, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert produced[position] == expected
@multi_gpu_test(num_gpus=4)
def test_chatglm3_lora_tp4(chatglm3_lora_files):
    """Tensor-parallel (4 GPUs) check without fully sharded LoRAs."""
    # Avoid OOM
    compilation_config = vllm.config.CompilationConfig(
        cudagraph_specialize_lora=False,
    )
    engine_kwargs = dict(
        max_model_len=512,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=64,
        max_num_seqs=16,
        tensor_parallel_size=4,
        trust_remote_code=True,
        fully_sharded_loras=False,
        compilation_config=compilation_config,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for lora_id in (1, 2):
        produced = do_sample(llm, chatglm3_lora_files, lora_id=lora_id)
        for position, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert produced[position] == expected
@multi_gpu_test(num_gpus=4)
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
    """Tensor-parallel (4 GPUs) check with fully sharded LoRAs enabled."""
    # Avoid OOM
    compilation_config = vllm.config.CompilationConfig(
        cudagraph_specialize_lora=False,
    )
    # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
    # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
    # more GPU memory causing vLLM to OOM
    engine_kwargs = dict(
        max_model_len=512,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=64,
        tensor_parallel_size=4,
        trust_remote_code=True,
        fully_sharded_loras=True,
        gpu_memory_utilization=0.8,
        compilation_config=compilation_config,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for lora_id in (1, 2):
        produced = do_sample(llm, chatglm3_lora_files, lora_id=lora_id)
        for position, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert produced[position] == expected

View File

@@ -0,0 +1,101 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# NOTE To avoid overloading the CI pipeline, this test script will
# not be triggered on CI and is primarily intended for local testing
# and verification.
import vllm
from vllm.lora.request import LoRARequest
from ..utils import multi_gpu_test
MODEL_PATH = "deepseek-ai/DeepSeek-V2-Lite-Chat"
PROMPT_TEMPLATE = "<begin▁of▁sentence>You are a helpful assistant.\n\nUser: {context}\n\nAssistant:" # noqa: E501
def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int):
    """Ask "Who are you?" and check the answer starts with the LoRA persona.

    A LoRA request is attached only when ``lora_id`` is truthy. Each
    prompt/result pair is printed before the prefix assertions run.
    """
    prompts = [
        PROMPT_TEMPLATE.format(context="Who are you?"),
    ]

    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)

    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    generated_texts: list[str] = []
    for result in outputs:
        prompt = result.prompt
        text = result.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Prompt: {prompt!r}, Generated text: {text!r}")

    expected_lora_output = [
        "I am \u5f20\u5b50\u8c6a, an AI assistant developed by \u9648\u58eb\u680b.",  # noqa: E501
    ]
    for position, expected_prefix in enumerate(expected_lora_output):
        assert generated_texts[position].startswith(expected_prefix)
def test_deepseekv2_lora(deepseekv2_lora_files):
    """Single-GPU DeepSeek-V2 LoRA check with chunked prefill enabled."""
    # enforce_eager=True reduces VRAM usage for lora-test CI; otherwise
    # the lora-test will fail due to CUDA OOM.
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
        enable_chunked_prefill=True,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(llm, deepseekv2_lora_files, 1)
def test_deepseekv2(deepseekv2_lora_files):
    """Single-GPU DeepSeek-V2 LoRA check with default prefill settings."""
    # enforce_eager=True reduces VRAM usage for lora-test CI; otherwise
    # the lora-test will fail due to CUDA OOM.
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(llm, deepseekv2_lora_files, 1)
@multi_gpu_test(num_gpus=2)
def test_deepseekv2_tp2(deepseekv2_lora_files):
    """DeepSeek-V2 LoRA check with tensor parallelism across 2 GPUs."""
    # enforce_eager=True reduces VRAM usage for lora-test CI; otherwise
    # the lora-test will fail due to CUDA OOM.
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
        tensor_parallel_size=2,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(llm, deepseekv2_lora_files, 2)
@multi_gpu_test(num_gpus=4)
def test_deepseekv2_tp4(deepseekv2_lora_files):
    """DeepSeek-V2 LoRA check with tensor parallelism across 4 GPUs."""
    # enforce_eager=True reduces VRAM usage for lora-test CI; otherwise
    # the lora-test will fail due to CUDA OOM.
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
        tensor_parallel_size=4,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(llm, deepseekv2_lora_files, 2)

View File

@@ -0,0 +1,157 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for applying default registered multimodal loras.
"""
import os
import unittest.mock as mock
import pytest
from huggingface_hub import snapshot_download
from vllm.lora.request import LoRARequest
from ..conftest import AudioTestAssets, VllmRunner
from ..utils import create_new_process_for_each_test
MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct")
AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora")
IMAGE_LORA_PATH = os.path.join(MODEL_PATH, "vision-lora")
AUDIO_PROMPT = "<|user|><|audio_1|>Can you transcribe this audio?<|end|><|assistant|>" # noqa: E501
# Responses are greedy decoded; we just check the end of
# the generated text. If the lora is inactive, this model
# generates commentary on the transcription.
RESPONSE_SUFFIX_WITH_LORA = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
RESPONSE_SUFFIX_WITHOUT_LORA = "Certainly! Here is the transcription of the audio you provided:\n\nThe first words I spoke in the original phonograph record: A little piece of practical poetry. Mary had a little lamb; its fleece was white as snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
# Keyword arguments shared by every runner created in this module; individual
# tests layer their own overrides on top of these.
VLLM_RUNNER_BASE_KWARGS = {
    "model_name": MODEL_PATH,
    "dtype": "half",
    # A real boolean, consistent with "enforce_eager" below (this used to be
    # the string "True", which only worked because it is truthy).
    "enable_lora": True,
    "max_num_seqs": 2,
    "max_lora_rank": 320,
    # Keep these LoRA tests on short-RoPE for determinism post-LongRoPE change.
    "max_model_len": 4096,
    "gpu_memory_utilization": 0.8,
    "limit_mm_per_prompt": {"audio": 1},
    "enforce_eager": True,
}
def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs):
    """Greedy-decode the audio prompt and check how the generated text ends.

    Args:
        vllm_runner: Callable returning the runner context manager.
        audio_assets: Audio assets; only the first one is used.
        lora_request: LoRA request forwarded to ``generate_greedy``.
        expected_suffix: Text the last generated output must end with.
        **kwargs: Overrides applied on top of VLLM_RUNNER_BASE_KWARGS.
    """
    first_audio = audio_assets[0].audio_and_sample_rate[0]
    batches = [([AUDIO_PROMPT], [first_audio])]

    # Apply any additional kwargs as overrides to the base kwargs
    runner_kwargs = dict(VLLM_RUNNER_BASE_KWARGS)
    runner_kwargs.update(kwargs)

    with vllm_runner(**runner_kwargs) as vllm_model:
        batch_outputs = []
        for prompts, audios in batches:
            batch_outputs.append(
                vllm_model.generate_greedy(
                    prompts,
                    max_tokens=128,
                    audios=audios,
                    lora_request=lora_request,
                )
            )

        final_text = batch_outputs[-1][-1][-1]
        assert final_text.endswith(expected_suffix)
@create_new_process_for_each_test()
def test_active_default_mm_lora(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Ensure that we can use the default audio lora."""
    default_loras = {"audio": AUDIO_LORA_PATH}
    run_test(
        vllm_runner,
        audio_assets,
        lora_request=None,
        expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
        default_mm_loras=default_loras,
    )
@create_new_process_for_each_test()
def test_inactive_default_mm_lora(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Ensure that modalities are filtered properly."""
    # Default image lora won't be active since we only pass audio
    default_loras = {"image": IMAGE_LORA_PATH}
    run_test(
        vllm_runner,
        audio_assets,
        lora_request=None,
        expected_suffix=RESPONSE_SUFFIX_WITHOUT_LORA,
        default_mm_loras=default_loras,
    )
@create_new_process_for_each_test()
def test_default_mm_lora_succeeds_with_redundant_lora_request(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Ensure that redundantly providing the lora works."""
    explicit_request = LoRARequest("audio", 1, AUDIO_LORA_PATH)
    default_loras = {"audio": AUDIO_LORA_PATH}
    run_test(
        vllm_runner,
        audio_assets,
        lora_request=explicit_request,
        expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
        default_mm_loras=default_loras,
    )
@create_new_process_for_each_test()
def test_default_mm_lora_fails_with_overridden_lora_request(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Ensure that if the lora_request conflicts with default_mm_loras,
    we use the lora_request."""
    explicit_request = LoRARequest("speech", 2, AUDIO_LORA_PATH)
    # The default deliberately points the audio modality at the image LoRA.
    conflicting_defaults = {"audio": IMAGE_LORA_PATH}
    run_test(
        vllm_runner,
        audio_assets,
        lora_request=explicit_request,
        expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
        default_mm_loras=conflicting_defaults,
    )
@create_new_process_for_each_test()
def test_default_mm_lora_does_not_expand_string_reqs(vllm_runner):
    """Regression test: a plain-string prompt must not pick up a default LoRA.

    The engine's ``add_request`` is patched to raise, so the test only
    inspects what was submitted instead of running a full generation.
    """

    class MockEngineException(Exception):
        pass

    # Regression test for ensuring default multimodal lora resolution
    # does not expand the lora req if the prompt type is a string.
    runner_kwargs = dict(VLLM_RUNNER_BASE_KWARGS)
    runner_kwargs["default_mm_loras"] = {"audio": AUDIO_LORA_PATH}

    # Avoid the full generation call since these tests are expensive;
    # just check what lora request is actually submitted to the engine
    mock_err = "Engine is mocked for this test"
    patched_add_request = mock.patch(
        "vllm.v1.engine.llm_engine.LLMEngine.add_request",
        side_effect=MockEngineException(mock_err),
    )

    with (
        patched_add_request as mock_add_request,
        vllm_runner(**runner_kwargs) as vllm_model,
    ):
        # Die once we actually submit the request to the engine
        with pytest.raises(MockEngineException):
            vllm_model.llm.generate(prompts=AUDIO_PROMPT)

        # Then check to make sure the submitted lora request
        # and text prompt were zipped together correctly
        _positional, submitted_kwargs = mock_add_request.call_args
        assert submitted_kwargs["lora_request"] is None
        assert submitted_kwargs["prompt_text"] == AUDIO_PROMPT

View File

@@ -0,0 +1,523 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random
import pytest
import torch
from tests.utils import multi_gpu_test
from vllm import _custom_ops as ops
from vllm.distributed import (
init_distributed_environment,
initialize_model_parallel,
tensor_model_parallel_all_gather,
tensor_model_parallel_all_reduce,
)
from vllm.distributed.parallel_state import (
get_tensor_model_parallel_world_size,
)
from vllm.lora.ops.triton_ops import fused_moe_lora
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_port
@pytest.fixture(autouse=True)
def reset_device(reset_default_device):
    """Request the ``reset_default_device`` fixture for every test in this module.

    The fixture body does nothing itself; it only depends on
    ``reset_default_device`` (defined elsewhere, presumably in conftest).
    """
def round_up(x, base):
    """Round ``x`` up to the next multiple of ``base`` (positive integers)."""
    num_blocks = (x + base - 1) // base
    return num_blocks * base
def CEILDIV(x, y):
    """Return ``x`` divided by ``y``, rounded up (positive integers)."""
    padded = x + y - 1
    return padded // y
def assign_loras_to_tokens(num_tokens: int, num_sequences: int, max_loras: int):
    """
    Partition `num_tokens` into `num_sequences` contiguous sequences and give
    every sequence one LoRA index drawn uniformly from [0, max_loras).

    The first `num_tokens % num_sequences` sequences receive one extra token.

    Args:
        num_tokens (int): Total number of tokens.
        num_sequences (int): Number of sequences to split the tokens into.
        max_loras (int): Total number of available LoRA modules.

    Returns:
        torch.Tensor: 1D int32 tensor of shape [num_tokens]; entry i is the
            LoRA index of the sequence that token i belongs to.
    """
    assert num_sequences > 0 and max_loras > 0
    assert num_tokens >= num_sequences, "num_tokens must be >= num_sequences"

    base_len, extra = divmod(num_tokens, num_sequences)
    token_lora_mapping = torch.empty(num_tokens, dtype=torch.int32)

    cursor = 0
    for seq_idx in range(num_sequences):
        seq_len = base_len + 1 if seq_idx < extra else base_len
        # One random LoRA id per sequence, broadcast over its token span.
        token_lora_mapping[cursor : cursor + seq_len] = random.randint(
            0, max_loras - 1
        )
        cursor += seq_len
    return token_lora_mapping
def assign_experts_to_tokens(num_tokens: int, num_experts: int, top_k_num: int):
    """
    Draw, for every token, `top_k_num` distinct experts out of `num_experts`
    together with random routing weights normalized to sum to 1 per token.

    Args:
        num_tokens (int): Total number of tokens.
        num_experts (int): Total number of available experts.
        top_k_num (int): Number of experts to select per token.

    Returns:
        expert_indices (torch.Tensor): int32, shape [num_tokens, top_k_num],
            expert index for each token.
        expert_weights (torch.Tensor): float32, shape [num_tokens, top_k_num],
            normalized weights (sum = 1 per row).
    """
    assert top_k_num <= num_experts, "top_k_num must be <= num_experts"

    expert_indices = torch.empty((num_tokens, top_k_num), dtype=torch.int32)
    for token in range(num_tokens):
        # A prefix of a random permutation yields distinct expert ids.
        expert_indices[token] = torch.randperm(num_experts)[:top_k_num]

    raw_weights = torch.rand((num_tokens, top_k_num), dtype=torch.float32)
    row_totals = raw_weights.sum(dim=1, keepdim=True)
    expert_weights = raw_weights / row_totals
    return expert_indices, expert_weights
def sample_data(
    num_tokens: int,
    num_sequences: int,
    max_loras: int,
    num_experts: int,
    top_k_num: int,
):
    """Generate random routing and LoRA assignments for one test case.

    Returns:
        Tuple of (topk_ids, topk_weights, token_lora_mapping) as produced by
        assign_experts_to_tokens and assign_loras_to_tokens.
    """
    routing = assign_experts_to_tokens(num_tokens, num_experts, top_k_num)
    lora_mapping = assign_loras_to_tokens(num_tokens, num_sequences, max_loras)
    expert_ids, expert_weights = routing
    return expert_ids, expert_weights, lora_mapping
def use_fused_moe_lora_kernel(
    topk_ids,
    topk_weights,
    token_lora_mapping,
    max_lora_rank,
    top_k_num,
    lora_a_stacked,
    lora_b_stacked,
    hidden_states,
    output,
    max_loras,
    num_experts,
    block_size,
    fully_sharded=False,
    offset=0,
):
    """Align tokens into blocks, then invoke the fused MoE LoRA kernel.

    Allocates the int32 work buffers, fills them via
    ops.moe_lora_align_block_size, and passes them to fused_moe_lora together
    with the LoRA weights. Nothing is returned; callers read the result from
    `output`.
    """
    # Padded capacity: all routed slots plus (block_size - 1) per expert,
    # rounded up to a whole number of blocks.
    padded_capacity = round_up(
        topk_ids.numel() + num_experts * (block_size - 1), block_size
    )
    m_block_capacity = CEILDIV(padded_capacity, block_size)

    # Work buffers filled by the alignment op.
    int32 = torch.int32
    sorted_token_ids = torch.empty((max_loras * padded_capacity,), dtype=int32)
    expert_ids = torch.empty((max_loras * m_block_capacity,), dtype=int32)
    num_tokens_post_padded = torch.empty((max_loras,), dtype=int32)
    adapter_enabled = torch.ones(max_loras + 1, dtype=int32)
    lora_ids = torch.arange(max_loras + 2, dtype=int32)

    ops.moe_lora_align_block_size(
        topk_ids,
        token_lora_mapping,
        num_experts,
        block_size,
        max_loras,
        padded_capacity,
        m_block_capacity,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
        adapter_enabled,
        lora_ids,
    )

    config = {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 32,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 1,
        "NUM_WARPS": 4,
        "NUM_STAGES": 3,
        "SPLIT_K": 1,
    }
    # Dict insertion order matches the positional order the kernel expects.
    tile_args = tuple(config.values())
    mul_routed_weight = False

    # The flat buffers are viewed per LoRA: one row per adapter slot.
    fused_moe_lora(
        output,
        hidden_states,
        lora_a_stacked,
        lora_b_stacked,
        topk_weights,
        sorted_token_ids.view(max_loras, -1),
        expert_ids.view(max_loras, -1),
        num_tokens_post_padded,
        max_lora_rank,
        top_k_num,
        lora_ids,
        adapter_enabled,
        # The same tile config is supplied twice (presumably once for the
        # shrink stage and once for the expand stage -- TODO confirm).
        *tile_args,
        *tile_args,
        mul_routed_weight,
        fully_sharded=fully_sharded,
        offset=offset,
    )
def use_torch(
    hidden_states,
    token_lora_mapping,
    topk_ids,
    lora_a_stacked,
    lora_b_stacked,
    top_k_num,
):
    """Reference implementation computed token by token in plain PyTorch.

    For token i and routed slot x, the result row is
    ``hidden_states[i] @ A.T @ B.T`` where A and B are taken from the first
    entry of the stacked lists, indexed by the token's LoRA id and the
    expert id in that slot.

    Returns:
        Tensor stacked over tokens and slots: [num_tokens, top_k_num, out].
    """
    a_weights = lora_a_stacked[0]
    b_weights = lora_b_stacked[0]

    per_token = []
    for row in range(hidden_states.shape[0]):
        adapter = token_lora_mapping[row]
        routed_experts = topk_ids[row]
        a_selected = a_weights[adapter][routed_experts]
        b_selected = b_weights[adapter][routed_experts]

        per_slot = []
        for slot in range(top_k_num):
            projected = hidden_states[row] @ a_selected[slot].T @ b_selected[slot].T
            per_slot.append(projected)
        per_token.append(torch.stack(per_slot, dim=0))
    return torch.stack(per_token, dim=0)
# Parametrization values for the kernel tests in this module.
DTYPES = [torch.float16, torch.bfloat16]
DEVICES = ["cuda:0"]
SEED = [42]
@pytest.mark.parametrize("num_tokens", [100])
@pytest.mark.parametrize("top_k_num", [6, 12])
@pytest.mark.parametrize("num_experts", [64])
@pytest.mark.parametrize("max_loras", [4, 6, 16])
@pytest.mark.parametrize("N", [1408])
@pytest.mark.parametrize("K", [2048])
@pytest.mark.parametrize("max_lora_rank", [16, 32, 64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
def test_fused_moe_lora_kernel(
    num_tokens,
    top_k_num,
    num_experts,
    max_loras,
    N,
    K,
    max_lora_rank,
    block_size,
    dtype,
    device,
    seed,
):
    """Compare the fused MoE LoRA kernel against the PyTorch reference."""
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    # Number of randomly generated sentences the tokens are split into.
    num_sequences = 10
    topk_ids, topk_weights, token_lora_mapping = sample_data(
        num_tokens, num_sequences, max_loras, num_experts, top_k_num
    )

    # Random LoRA weights and activations. Creation order (A, B, hidden
    # states) is kept so the seeded random stream is consumed identically.
    lora_a_shape = (max_loras, num_experts, max_lora_rank, K)
    lora_b_shape = (max_loras, num_experts, N, max_lora_rank)
    lora_a_stacked = [torch.rand(lora_a_shape, dtype=dtype)]
    lora_b_stacked = [torch.rand(lora_b_shape, dtype=dtype)]
    hidden_states = torch.rand((num_tokens, K), dtype=dtype)

    # The kernel writes into a zero-initialized buffer.
    kernel_output = torch.zeros((num_tokens, top_k_num, N), dtype=dtype)
    use_fused_moe_lora_kernel(
        topk_ids,
        topk_weights,
        token_lora_mapping,
        max_lora_rank,
        top_k_num,
        lora_a_stacked,
        lora_b_stacked,
        hidden_states,
        kernel_output,
        max_loras,
        num_experts,
        block_size,
    )

    reference_output = use_torch(
        hidden_states,
        token_lora_mapping,
        topk_ids,
        lora_a_stacked,
        lora_b_stacked,
        top_k_num,
    )

    torch.testing.assert_close(
        kernel_output, reference_output, atol=1e-1, rtol=1e-1
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("num_tokens", [100])
@pytest.mark.parametrize("top_k_num", [6])
@pytest.mark.parametrize("num_experts", [64])
@pytest.mark.parametrize("max_loras", [4])
@pytest.mark.parametrize("N", [1408])
@pytest.mark.parametrize("K", [2048])
@pytest.mark.parametrize("max_lora_rank", [16, 32, 64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("column_parallel", [True, False])
def test_fused_moe_lora_kernel_fully_sharded(
    num_tokens,
    top_k_num,
    num_experts,
    max_loras,
    N,
    K,
    max_lora_rank,
    block_size,
    dtype,
    seed,
    column_parallel,
):
    """Run the fully sharded kernel check in two spawned worker processes.

    Routing data is sampled once here and handed to every worker; each
    worker runs use_fused_moe_lora_kernel_tensor_parallel.
    """
    current_platform.seed_everything(seed)

    # Number of randomly generated sentences the tokens are split into.
    num_sequences = 10
    topk_ids, topk_weights, token_lora_mapping = sample_data(
        num_tokens, num_sequences, max_loras, num_experts, top_k_num
    )

    world_size = 2
    init_method = f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{get_open_port()}"
    # spawn() prepends the process index, which the worker takes as local_rank.
    worker_args = (
        world_size,
        init_method,
        dtype,
        seed,
        N,
        K,
        num_tokens,
        topk_ids,
        topk_weights,
        token_lora_mapping,
        max_lora_rank,
        top_k_num,
        max_loras,
        num_experts,
        block_size,
        column_parallel,
    )
    torch.multiprocessing.spawn(
        use_fused_moe_lora_kernel_tensor_parallel,
        args=worker_args,
        nprocs=world_size,
    )
def use_fused_moe_lora_kernel_tensor_parallel(
    local_rank,
    world_size,
    init_method,
    dtype,
    seed,
    N,
    K,
    num_tokens,
    topk_ids,
    topk_weights,
    token_lora_mapping,
    max_lora_rank,
    top_k_num,
    max_loras,
    num_experts,
    block_size,
    column_parallel,
):
    """Per-rank worker for the fully sharded fused MoE LoRA test.

    Every rank seeds identically, builds the full LoRA weights and the full
    PyTorch reference, then keeps only its own shard, runs the kernel with
    fully_sharded=True and combines the partial outputs across ranks
    (all-gather for column parallel, all-reduce for row parallel) before
    comparing against the reference.
    """

    def rank_slice(shard_size):
        # Contiguous span owned by this rank along a sharded dimension.
        begin = local_rank * shard_size
        return slice(begin, begin + shard_size)

    current_platform.seed_everything(seed)

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

    init_distributed_environment(
        world_size=world_size,
        rank=local_rank,
        local_rank=local_rank,
        distributed_init_method=init_method,
    )
    initialize_model_parallel(world_size, 1)
    tp_size = get_tensor_model_parallel_world_size()

    if column_parallel:
        input_dim, output_dim = K, N
    else:
        input_dim, output_dim = N, K

    # Full (unsharded) weights and activations; creation order is kept so
    # the seeded random stream matches on every rank.
    lora_a = torch.rand(
        (max_loras, num_experts, max_lora_rank, input_dim), dtype=dtype
    )
    lora_b = torch.rand(
        (max_loras, num_experts, output_dim, max_lora_rank), dtype=dtype
    )
    hidden_states = torch.rand((num_tokens, input_dim), dtype=dtype)

    output = torch.zeros((num_tokens, top_k_num, output_dim), dtype=dtype)
    topk_ids = topk_ids.to(device)
    topk_weights = topk_weights.to(device)
    token_lora_mapping = token_lora_mapping.to(device)

    # The reference uses the full weights, before any sharding.
    ref_output = use_torch(
        hidden_states,
        token_lora_mapping,
        topk_ids,
        [lora_a],
        [lora_b],
        top_k_num,
    )

    # In both layouts LoRA B is sliced along the output dim.
    lora_b_shard_size = output_dim // tp_size
    lora_b = lora_b[:, :, rank_slice(lora_b_shard_size), :]

    if column_parallel:
        # Column parallel (e.g. gate_up_proj): LoRA A is sliced along the
        # rank dim and the local output only covers this rank's columns.
        lora_a_shard_size = max_lora_rank // tp_size
        lora_a = lora_a[:, :, rank_slice(lora_a_shard_size), :]
        max_lora_rank = lora_a_shard_size
        offset = 0
        output = output[:, :, rank_slice(lora_b_shard_size)].contiguous()
    else:
        # Row parallel (e.g. down proj): LoRA A and the activations are
        # sliced along the input dim; the offset is this rank's start
        # position along the output dim.
        lora_a_shard_size = input_dim // tp_size
        lora_a = lora_a[:, :, :, rank_slice(lora_a_shard_size)]
        hidden_states = hidden_states[:, rank_slice(lora_a_shard_size)]
        offset = lora_b_shard_size * local_rank

    use_fused_moe_lora_kernel(
        topk_ids,
        topk_weights,
        token_lora_mapping,
        max_lora_rank,
        top_k_num,
        [lora_a],
        [lora_b],
        hidden_states,
        output,
        max_loras,
        num_experts,
        block_size,
        fully_sharded=True,
        offset=offset,
    )

    combine = (
        tensor_model_parallel_all_gather
        if column_parallel
        else tensor_model_parallel_all_reduce
    )
    output = combine(output)

    torch.testing.assert_close(output, ref_output, atol=1e-1, rtol=1e-1)

View File

@@ -1,46 +0,0 @@
import vllm
from vllm.lora.request import LoRARequest
MODEL_PATH = "google/gemma-7b"
def do_sample(llm, lora_path: str, lora_id: int) -> str:
    """Greedily generate completions for three quote prompts.

    A LoRA request built from `lora_id` and `lora_path` is attached when
    `lora_id` is truthy; otherwise generation runs without LoRA. Each
    prompt and its stripped completion are printed.

    NOTE(review): the annotation says ``str`` but a list of stripped
    completion strings is returned.
    """
    prompts = [
        "Quote: Imagination is",
        "Quote: Be yourself;",
        "Quote: So many books,",
    ]
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)

    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    generated_texts = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        generated_texts.append(generated_text)
    return generated_texts
def test_gemma_lora(gemma_lora_files):
    """Check that two LoRA ids backed by the same files give the expected text.

    Each completion must start with the corresponding expected prefix.
    """
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
    )

    expected_lora_output = [
        "more important than knowledge.\nAuthor: Albert Einstein\n",
        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
        "so little time\nAuthor: Frank Zappa\n",
    ]

    for lora_id in (1, 2):
        generated = do_sample(llm, gemma_lora_files, lora_id=lora_id)
        for position, expected_prefix in enumerate(expected_lora_output):
            assert generated[position].startswith(expected_prefix)

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import vllm
from vllm.lora.request import LoRARequest
from ..utils import multi_gpu_test
MODEL_PATH = "openai/gpt-oss-20b"
PROMPT_TEMPLATE = """<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-10-29
Reasoning: medium
# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>user<|message|>I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
"
##Instruction:
farm contains tables such as city, farm, farm_competition, competition_record. Table city has columns such as City_ID, Official_Name, Status, Area_km_2, Population, Census_Ranking. City_ID is the primary key.
Table farm has columns such as Farm_ID, Year, Total_Horses, Working_Horses, Total_Cattle, Oxen, Bulls, Cows, Pigs, Sheep_and_Goats. Farm_ID is the primary key.
Table farm_competition has columns such as Competition_ID, Year, Theme, Host_city_ID, Hosts. Competition_ID is the primary key.
Table competition_record has columns such as Competition_ID, Farm_ID, Rank. Competition_ID is the primary key.
The Host_city_ID of farm_competition is the foreign key of City_ID of city.
The Farm_ID of competition_record is the foreign key of Farm_ID of farm.
The Competition_ID of competition_record is the foreign key of Competition_ID of farm_competition.
###Input:
{context}
###Response:<|end|><|start|>assistant<|channel|>final<|message|>""" # noqa: E501
EXPECTED_LORA_OUTPUT = [
"SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
"SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
"SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
]
def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
    """Generate SQL for three farm questions and check the expected prefixes.

    A LoRA request built from `lora_id` and `lora_path` is attached when
    `lora_id` is truthy. Each stripped completion must start with the
    matching entry of EXPECTED_LORA_OUTPUT.
    """
    questions = (
        "Give the average number of working horses on farms with more than 5000 total horses.",  # noqa: E501
        "What are the maximum and minimum number of cows across all farms.",
        "Return the maximum and minimum number of cows across all farms.",
    )
    prompts = [PROMPT_TEMPLATE.format(context=question) for question in questions]

    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    # Print the outputs while collecting them.
    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        generated_texts.append(generated_text)

    for position, expected_prefix in enumerate(EXPECTED_LORA_OUTPUT):
        assert generated_texts[position].startswith(expected_prefix)
def test_gpt_oss_lora(gptoss20b_lora_files):
    """Serve the model with LoRA enabled and check LoRA ids 1 and 2."""
    # Avoid OOM
    compilation_config = vllm.config.CompilationConfig(
        cudagraph_specialize_lora=False,
    )
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=8,
        compilation_config=compilation_config,
    )

    for lora_id in (1, 2):
        generate_and_test(llm, gptoss20b_lora_files, lora_id=lora_id)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
    """Same check as the single-GPU test, with tensor_parallel_size=2.

    Parametrized over whether fully sharded LoRA layers are used.
    """
    # Avoid OOM
    compilation_config = vllm.config.CompilationConfig(
        cudagraph_specialize_lora=False,
    )
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=8,
        max_num_seqs=16,
        tensor_parallel_size=2,
        fully_sharded_loras=fully_sharded_loras,
        compilation_config=compilation_config,
    )

    for lora_id in (1, 2):
        generate_and_test(llm, gptoss20b_lora_files, lora_id=lora_id)

View File

@@ -1,106 +0,0 @@
import tempfile
from random import sample
from typing import List, Optional
import peft
import pytest
from transformers import AutoModelForCausalLM
import vllm
from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "Felladrin/Llama-68M-Chat-v1"
PROMPTS = [
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501
]
def get_lora_model(model_id: str, target_modules: List[str], rank: int):
    """Load `model_id` as a causal LM and wrap it in a PEFT LoRA model.

    NOTE(review): `target_modules` and `rank` are passed to LoraConfig
    positionally; confirm against the installed peft version that they bind
    to the intended `target_modules` and `r` fields.
    """
    base_model = AutoModelForCausalLM.from_pretrained(model_id)
    lora_config = peft.tuners.lora.LoraConfig(target_modules, rank)
    return peft.PeftModel(base_model, lora_config)
def do_sample(llm,
              lora_path: Optional[str] = None,
              lora_id: Optional[int] = None,
              logprobs: int = 0,
              n_tokens: int = 256):
    """Greedily generate completions for PROMPTS, optionally through a LoRA.

    A LoRA request is attached only when `lora_id` is truthy. Returns the
    generated texts when `logprobs` is falsy; otherwise returns, per prompt,
    one list of logprob keys for every generated position.
    """
    prompts = PROMPTS
    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=n_tokens,
                                          logprobs=logprobs,
                                          stop=["[/assistant]"])
    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    generated_texts = []
    generated_logprobs = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        # Flatten over completions: one key list per generated position.
        position_keys = []
        for out in output.outputs:
            for logprob in out.logprobs:
                position_keys.append(list(logprob.keys()))
        generated_logprobs.append(position_keys)

    if logprobs:
        return generated_logprobs
    return generated_texts
# Module names from which random LoRA target subsets are drawn.
SUPPORTED_MODULES = [
    "qkv_proj",
    "o_proj",
    "gate_up_proj",
    "down_proj",
    "embed_tokens",
    "lm_head",
]

# Three random subsets for each subset size from 2 to 5 (12 in total).
TARGET_MODULES_LIST = []
for length in range(2, 6):
    TARGET_MODULES_LIST += [sample(SUPPORTED_MODULES, length) for _ in range(3)]
# Test the correctness when layer and rank are varied
# step 1: init a base model and serve with LoRA to get the reference results
# step 2: merge the same LoRA to the base model, serve the merged model
# step 3: compare the results from step 1 and step 2
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST)
@pytest.mark.parametrize("rank", [8, 16, 32, 64])
def test_layer_variation_correctness(tp_size, target_modules, rank):
# Step 1: serve the base model with LoRA enabled.
llm = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=tp_size,
worker_use_ray=True)
model = get_lora_model(MODEL_PATH, target_modules, rank)
# The adapter is saved to a temporary directory and loaded from there as
# LoRA id 1.
with tempfile.TemporaryDirectory() as tmpdir:
model.save_pretrained(tmpdir)
# NOTE: despite its name, merged_probs holds the logprob keys produced by
# the LoRA-served (unmerged) model; they act as the reference.
merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32)
# Drop the engine and run the shared cleanup before creating the next one.
del llm
cleanup()
# prob[0] is the key list of the first generated position of each prompt.
reference_id_sets = [set(prob[0]) for prob in merged_probs]
# Step 2: build the same adapter again, merge it into the base weights and
# serve the merged checkpoint without LoRA.
model = get_lora_model(MODEL_PATH, target_modules, rank)
with tempfile.TemporaryDirectory() as tmpdir:
merged_model = model.merge_and_unload()
merged_model.save_pretrained(tmpdir)
# The tokenizer is still loaded from the original model path.
llm = vllm.LLM(tmpdir,
tokenizer=MODEL_PATH,
enable_lora=False,
max_num_seqs=16,
tensor_parallel_size=tp_size,
worker_use_ray=True)
probs = do_sample(llm, logprobs=5, n_tokens=32)
del llm
cleanup()
# Step 3: verify the top-5 tokens are identical.
# NOTE(review): only the first generated position (prob[0]) of each prompt
# is compared, not every token.
id_sets = [set(prob[0]) for prob in probs]
assert id_sets == reference_id_sets

File diff suppressed because it is too large Load Diff

View File

@@ -1,148 +0,0 @@
import pytest
import ray
import vllm
from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
def do_sample(llm, lora_path: str, lora_id: int):
    """Greedily generate SQL completions for six text-to-SQL prompts.

    A LoRA request built from `lora_id` and `lora_path` is attached when
    `lora_id` is truthy. Each prompt and completion is printed, and the
    list of (unstripped) completions is returned.
    """
    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
    ]
    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=256,
                                          stop=["[/assistant]"])
    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    # Print the outputs while collecting them.
    generated_texts = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        generated_texts.append(generated_text)
    return generated_texts
@pytest.mark.parametrize("tp_size", [1])
def test_llama_lora(sql_lora_files, tp_size):
    """Alternate between no LoRA and two LoRA ids and compare exact outputs.

    The sequence is: no LoRA, LoRA 1, no LoRA, LoRA 2. Both LoRA ids load
    the same files and must produce the same expected SQL.
    """
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < tp_size:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    llm = vllm.LLM(MODEL_PATH,
                   enable_lora=True,
                   max_num_seqs=16,
                   max_loras=4,
                   tensor_parallel_size=tp_size)

    expected_no_lora_output = [
        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",  # noqa: E501
        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",  # noqa: E501
        "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",  # noqa: E501
        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",  # noqa: E501
        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",  # noqa: E501
        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",  # noqa: E501
    ]
    expected_lora_output = [
        " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
        " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
        " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
        " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
        " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
        " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
    ]

    # (progress banner, lora id, expected completions); lora id 0 = no LoRA.
    steps = (
        ("lora adapter created", 0, expected_no_lora_output),
        ("lora 1", 1, expected_lora_output),
        ("no lora", 0, expected_no_lora_output),
        ("lora 2", 2, expected_lora_output),
    )
    for banner, lora_id, expected in steps:
        print(banner)
        assert do_sample(llm, sql_lora_files, lora_id=lora_id) == expected

    print("removing lora")
@pytest.mark.skip("Requires multiple GPUs")
def test_llama_tensor_parallel_equality(sql_lora_files):
    """Check that LoRA outputs match across tensor parallel sizes 1, 2 and 4."""
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < 4:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")

    def sample_with_tp(tp_size):
        # Build an engine, sample with LoRA id 1, then drop the engine and
        # run the shared cleanup before the next engine is created.
        llm = vllm.LLM(MODEL_PATH,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       tensor_parallel_size=tp_size)
        texts = do_sample(llm, sql_lora_files, lora_id=1)
        del llm
        cleanup()
        return texts

    output_tp1 = sample_with_tp(1)
    for tp_size in (2, 4):
        assert output_tp1 == sample_with_tp(tp_size)
def test_llama_lora_warmup(sql_lora_files):
    """Check that enabling LoRA leaves fewer GPU blocks for the KV cache.

    Each engine is created inside its own single-GPU ray task and the
    resulting num_gpu_blocks values are compared.
    """

    @ray.remote(num_gpus=1)
    def get_num_gpu_blocks_lora():
        llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
        return llm.llm_engine.cache_config.num_gpu_blocks

    @ray.remote(num_gpus=1)
    def get_num_gpu_blocks_no_lora():
        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
        return llm.llm_engine.cache_config.num_gpu_blocks

    # The two tasks run one after the other: each result is awaited in turn.
    blocks_with_lora = ray.get(get_num_gpu_blocks_lora.remote())
    blocks_without_lora = ray.get(get_num_gpu_blocks_no_lora.remote())

    assert blocks_with_lora < blocks_without_lora, (
        "The warmup with lora should be more "
        "conservative than without lora, therefore the number of "
        "memory blocks for the KV cache should be "
        "less when using lora than when not using lora")

231
tests/lora/test_llama_tp.py Normal file
View File

@@ -0,0 +1,231 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import sys
import pytest
import vllm
import vllm.config
from vllm import LLM
from vllm.lora.request import LoRARequest
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
PROMPT_TEMPLATE = """<|eot_id|><|start_header_id|>user<|end_header_id|>
I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
"
##Instruction:
candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key.
Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key.
The People_ID of candidate is the foreign key of People_ID of people.
###Input:
{context}
###Response:<|eot_id|><|start_header_id|>assistant<|end_header_id|>
""" # noqa: E501
EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM candidate",
"SELECT count(*) FROM candidate",
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
]
MODEL_PATH = "meta-llama/Llama-3.2-3B-Instruct"
def do_sample(
    llm: vllm.LLM,
    lora_path: str,
    lora_id: int,
    tensorizer_config_dict: dict | None = None,
) -> list[str]:
    """Generate greedy completions for the four SQL prompts.

    Args:
        llm: Engine used for generation.
        lora_path: Location of the LoRA adapter.
        lora_id: Adapter id; a falsy value (e.g. 0) means no LoRA request.
        tensorizer_config_dict: When not ``None``, forwarded to the
            ``LoRARequest`` under the same keyword.

    Returns:
        The generated text of the first completion of every prompt, in order.
    """
    questions = (
        "How many candidates are there?",
        "Count the number of candidates.",
        "Which poll resource provided the most number of candidate information?",  # noqa: E501
        "Return the poll resource associated with the most candidates.",
    )
    prompts = [PROMPT_TEMPLATE.format(context=question) for question in questions]
    sampling_params = vllm.SamplingParams(
        temperature=0, max_tokens=64, stop=["<|im_end|>"]
    )

    # Only pass the tensorizer keyword when the caller supplied a value, so
    # the request is built exactly as it would be without it otherwise.
    extra_request_kwargs = {}
    if tensorizer_config_dict is not None:
        extra_request_kwargs["tensorizer_config_dict"] = tensorizer_config_dict

    lora_request = None
    if lora_id:
        lora_request = LoRARequest(
            str(lora_id), lora_id, lora_path, **extra_request_kwargs
        )

    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    # Echo every prompt/completion pair while collecting the completions.
    generated_texts: list[str] = []
    for request_output in outputs:
        prompt = request_output.prompt
        text = request_output.outputs[0].text
        generated_texts.append(text)
        print(f"Prompt: {prompt!r}, Generated text: {text!r}")
    return generated_texts
def generate_and_test(
    llm, llama32_lora_files, tensorizer_config_dict: dict | None = None
):
    """Sample with LoRA ids 1 and 2 and require the expected SQL output for both.

    Args:
        llm: Engine to sample from.
        llama32_lora_files: Adapter location handed to ``do_sample``.
        tensorizer_config_dict: Optional config forwarded to ``do_sample``.
    """
    print("lora adapter created")
    # Both ids load the same adapter files, so both must yield the same text.
    for banner, lora_id in (("lora 1", 1), ("lora 2", 2)):
        print(banner)
        generated = do_sample(
            llm,
            llama32_lora_files,
            tensorizer_config_dict=tensorizer_config_dict,
            lora_id=lora_id,
        )
        assert generated == EXPECTED_LORA_OUTPUT
    print("removing lora")
@create_new_process_for_each_test()
@pytest.mark.parametrize("cudagraph_specialize_lora", [True, False])
def test_llama_lora(llama32_lora_files, cudagraph_specialize_lora: bool):
    """Run the LoRA output check with ``cudagraph_specialize_lora`` on and off."""
    compilation_config = vllm.config.CompilationConfig(
        cudagraph_specialize_lora=cudagraph_specialize_lora,
    )
    engine_kwargs = {
        "enable_lora": True,
        # An odd max_num_seqs is exercised on purpose.
        "max_num_seqs": 7,
        "max_model_len": 1024,
        "max_loras": 4,
        "compilation_config": compilation_config,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(llm, llama32_lora_files)
@multi_gpu_test(num_gpus=4)
def test_llama_lora_tp4(llama32_lora_files):
    """Run the LoRA output check with tensor parallel size 4."""
    engine_kwargs = {
        "enable_lora": True,
        "max_num_seqs": 7,
        "max_model_len": 1024,
        "max_loras": 4,
        "tensor_parallel_size": 4,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(llm, llama32_lora_files)
@multi_gpu_test(num_gpus=4)
def test_llama_lora_tp4_fully_sharded_loras(llama32_lora_files):
    """Run the LoRA output check with TP size 4 and ``fully_sharded_loras`` enabled."""
    engine_kwargs = {
        "enable_lora": True,
        "max_num_seqs": 8,
        "max_loras": 4,
        "max_model_len": 1024,
        "tensor_parallel_size": 4,
        "fully_sharded_loras": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(llm, llama32_lora_files)
@multi_gpu_test(num_gpus=2)
def test_tp2_serialize_and_deserialize_lora(
    tmp_path,
    llama32_lora_files,
):
    """Serialize model and LoRA with tensorizer at TP=2, reload them, and check output.

    Serialization runs the ``tensorize_vllm_model.py`` example script in a
    subprocess (to guarantee cleanup); the serialized model is then loaded
    with ``load_format="tensorizer"`` and sampled with LoRA id 1.
    """
    tp_size = 2
    model_name = "model-rank-%03d.tensors"
    model_ref = MODEL_PATH
    lora_path = llama32_lora_files
    suffix = "test"

    serialize_cmd = [
        sys.executable,
        f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
        "--model",
        MODEL_PATH,
        "--lora-path",
        lora_path,
        "--tensor-parallel-size",
        str(tp_size),
        "serialize",
        "--serialized-directory",
        str(tmp_path),
        "--suffix",
        suffix,
        "--serialization-kwargs",
        '{"limit_cpu_concurrency": 4}',
    ]
    try:
        result = subprocess.run(
            serialize_cmd, check=True, capture_output=True, text=True
        )
    except subprocess.CalledProcessError as e:
        # Surface the child's output before propagating the failure.
        print("Tensorizing failed.")
        print("STDOUT:\n", e.stdout)
        print("STDERR:\n", e.stderr)
        raise

    print("STDOUT:\n", result.stdout)

    # Path of the serialized shards; "%03d" in model_name is left as a
    # placeholder in the URI (presumably filled per rank by the loader).
    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))

    loaded_llm = LLM(
        model=model_ref,
        load_format="tensorizer",
        enable_lora=True,
        enforce_eager=True,
        model_loader_extra_config=tensorizer_config,
        max_num_seqs=7,
        max_model_len=1024,
        tensor_parallel_size=tp_size,
        max_loras=2,
    )

    tc_as_dict = tensorizer_config.to_serializable()

    print("lora adapter created")
    print("lora 1")
    generated = do_sample(
        loaded_llm, lora_path, tensorizer_config_dict=tc_as_dict, lora_id=1
    )
    assert generated == EXPECTED_LORA_OUTPUT

View File

@@ -0,0 +1,187 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script contains:
1. test multi loras service with tp >= 2
2. test multi loras request
"""
import pytest
from tests.utils import multi_gpu_test
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
MODEL_PATH = "Qwen/Qwen3-0.6B"
LORA_NAME_PATH_MAP = {
"Alice": "charent/self_cognition_Alice",
"Bob": "charent/self_cognition_Bob",
"Cat": "charent/self_cognition_Bob", # same as Bob
}
LORA_NAME_ID_MAP = {}
INCREASE_LORA_ID = 0
LORA_RANK = 8
LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
LORA_TEST_EXPECTED = [
"GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501
"I am Alice, an AI assistant developed by GitHub/Charent.",
]
def format_chatml_messages(prompt: str):
    """Wrap *prompt* as a two-message chat: a fixed system message, then the user turn."""
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": prompt}
    return [system_message, user_message]
def make_add_lora_request(name: str, path: str):
    """Build a ``LoRARequest`` for *name* with a freshly allocated integer id.

    Increments the module-level ``INCREASE_LORA_ID`` counter and records the
    new id for *name* in ``LORA_NAME_ID_MAP`` (overwriting any previous id).
    """
    global INCREASE_LORA_ID
    INCREASE_LORA_ID += 1
    new_id = INCREASE_LORA_ID
    LORA_NAME_ID_MAP[name] = new_id
    return LoRARequest(lora_name=name, lora_int_id=new_id, lora_path=path)
@multi_gpu_test(num_gpus=2)
def test_multi_loras_with_tp_sync():
    """Check that Alice's LoRA output stays correct under TP=2 with multiple adapters.

    Three adapters are registered, then for every test prompt Alice's answer
    is verified initially, after Bob has been used, after Bob has been
    reloaded, and after Alice herself has been reloaded.
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,  # ensure max_loras < max_cpu_loras
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
        tensor_parallel_size=2,  # ensure tp >= 2
        max_cpu_loras=4,  # ensure max_cpu_loras >= 2
    )

    def run_check_lora(fn, args, expected: list):
        """Call ``fn(args)`` and require the engine to list exactly *expected* ids."""
        fn(args)
        assert set(llm.llm_engine.list_loras()) == set(expected)

    # Simulate adding LoRAs with CLI args, like:
    # `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob`
    # The expected ids are taken from the requests themselves rather than
    # hard-coded, because ids come from a module-global counter.
    registered_ids: list[int] = []
    for name in ("Alice", "Bob", "Cat"):
        request = make_add_lora_request(name, LORA_NAME_PATH_MAP[name])
        registered_ids.append(request.lora_int_id)
        run_check_lora(llm.llm_engine.add_lora, request, registered_ids)

    # temperature = 0 gives greedy decoding, so outputs can be compared exactly
    sampling_params = SamplingParams(temperature=0, max_tokens=64)

    def call_llm_get_outputs(prompt: str, lora_name: str):
        """Chat once with *prompt* through adapter *lora_name*; return the text."""
        lora_request = LoRARequest(
            lora_name=lora_name,
            lora_int_id=LORA_NAME_ID_MAP[lora_name],
            lora_path=LORA_NAME_PATH_MAP[lora_name],
        )
        messages = format_chatml_messages(prompt)
        outputs = llm.chat(
            [messages],
            sampling_params,
            chat_template_kwargs={
                "enable_thinking": False
            },  # for those loras, ensure enable_thinking=False
            lora_request=lora_request,
            use_tqdm=False,
        )
        return outputs[0].outputs[0].text

    def reload_lora(name: str):
        """
        reload a lora to simulate the case:
        setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
        for dynamic lora loading and unloading
        """
        remove_lora_response = llm.llm_engine.remove_lora(
            lora_id=LORA_NAME_ID_MAP[name]
        )
        # Re-adding allocates a new id for *name* via make_add_lora_request.
        add_lora_response = llm.llm_engine.add_lora(
            make_add_lora_request(name, LORA_NAME_PATH_MAP[name])
        )
        print(f"{remove_lora_response=}, {add_lora_response=}")

    def check_outputs(prompt: str, output_text: str, expected_output: str):
        """Log the comparison and require an exact match.

        All values are explicit parameters; the helper no longer reads the
        enclosing loop's variables through its closure.
        """
        print(f"{prompt=}.\n{expected_output=}\n{output_text=}")
        print("\n----------------------------\n")
        assert output_text == expected_output

    def verify_alice(prompt: str, expected_output: str):
        """Query Alice with *prompt* and check the answer."""
        check_outputs(prompt, call_llm_get_outputs(prompt, "Alice"), expected_output)

    for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
        verify_alice(prompt, expected_output)

        # call Bob; its output is intentionally ignored
        call_llm_get_outputs(prompt, "Bob")
        print("After call Bob:")
        verify_alice(prompt, expected_output)

        # reload Bob's LoRA, then Alice must still answer correctly
        reload_lora("Bob")
        print("After reload Bob:")
        verify_alice(prompt, expected_output)

        # reload Alice's LoRA itself
        reload_lora("Alice")
        print("After reload Alice:")
        verify_alice(prompt, expected_output)
def test_multiple_lora_requests():
    """Check how ``LLM.generate`` accepts a list of, or a single, ``LoRARequest``."""
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    PROMPTS = ["Hello, my name is"] * 2
    LORA_NAME = "Alice"
    adapter_path = LORA_NAME_PATH_MAP[LORA_NAME]
    lora_request = [
        LoRARequest(LORA_NAME + str(idx), idx + 1, adapter_path)
        for idx in range(len(PROMPTS))
    ]

    # One LoRARequest per prompt: every prompt yields an output.
    outputs = llm.generate(PROMPTS, lora_request=lora_request)
    assert len(PROMPTS) == len(outputs)

    # A request list whose length differs from the prompt count is rejected.
    with pytest.raises(ValueError):
        llm.generate(PROMPTS, lora_request=lora_request[:1])

    # A single (non-list) LoRARequest is accepted for all prompts.
    single_lora_request = lora_request[0]
    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
    assert len(PROMPTS) == len(outputs)

View File

@@ -1,224 +0,0 @@
import pytest
import torch
from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice
from .utils import DummyLoRAManager
TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4]
QKV_TENSOR_SIZES = [
(8192, 1024, 1024),
(8192 // 8, 1024 // 8, 1024 // 8),
(4096, 4096, 4096),
(4096 // 2, 4096 // 2, 4096 // 2),
]
BATCH_SIZES = [8, 32, 256]
RANKS = [8]
DTYPES = [torch.float16]
TOLERANCES = {
torch.float16: (5e-3, 5e-3),
torch.bfloat16: (3e-2, 2e-2),
}
@pytest.mark.parametrize("m", TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora(m, n, k, rank, dtype) -> None:
    """Compare ``_apply_lora`` with the dense reference ``x @ A @ B * scaling``.

    Every one of the 8 stack slots holds the same adapter, so any random
    index must reproduce the reference. A second call with all indices set
    to -1 must leave the output at zero.
    """
    manager = DummyLoRAManager()

    module_name = "module"
    weight = torch.rand([m, n], device="cuda", dtype=dtype)

    manager.init_random_lora(module_name, weight, rank=rank)
    lora = manager.get_module_lora(module_name)

    x = torch.rand(k, n, device="cuda", dtype=dtype)
    expected = x @ lora.lora_a @ lora.lora_b * lora.scaling

    num_slots = 8
    a_rows, a_cols = lora.lora_a.shape
    b_rows, b_cols = lora.lora_b.shape
    # Stacks store the transposed matrices, hence the swapped dimensions.
    lora_a_stack = torch.zeros(
        num_slots, 1, a_cols, a_rows, device="cuda", dtype=dtype
    )
    lora_b_stack = torch.zeros(
        num_slots, 1, b_cols, b_rows, device="cuda", dtype=dtype
    )
    for slot in range(lora_a_stack.shape[0]):
        lora_a_stack[slot][0] = lora.lora_a.T
        # The scaling factor is folded into B before stacking.
        lora_b_stack[slot][0] = (lora.lora_b * lora.scaling).T

    output = torch.zeros(k, m, device="cuda", dtype=dtype)
    random_indices = torch.randint(
        0, lora_a_stack.shape[0], (len(x), ), device="cuda"
    )
    _apply_lora(x, lora_a_stack, lora_b_stack, random_indices, output)

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    # Index -1 for every row: nothing may be written to the output.
    output[:] = 0
    no_lora_indices = torch.full((len(x), ), -1, device="cuda")
    _apply_lora(x, lora_a_stack, lora_b_stack, no_lora_indices, output)
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()
@pytest.mark.parametrize("m", TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
    """Compare ``_apply_lora_packed_nslice`` with a dense reference for two slices.

    Two independent adapters, each for an ``m // 2``-row weight, are packed
    side by side. The second call with all indices -1 must leave the output
    at zero.
    """
    if m % 2 != 0:
        pytest.skip("m must be divisible by 2")
    if m // 2 not in TENSOR_SIZES:
        pytest.skip("m//2 must be in TENSOR_SIZES")

    half = m // 2
    manager = DummyLoRAManager()

    module_name = "module"
    weight = torch.rand([half, n], device="cuda", dtype=dtype)

    manager.init_random_lora(module_name + "1", weight, rank=rank)
    lora_1 = manager.get_module_lora(module_name + "1")
    manager.init_random_lora(module_name + "2", weight, rank=rank)
    lora_2 = manager.get_module_lora(module_name + "2")
    slice_loras = (lora_1, lora_2)

    x = torch.rand(k, n, device="cuda", dtype=dtype)
    expected = torch.cat(
        [x @ lora.lora_a @ lora.lora_b * lora.scaling for lora in slice_loras],
        dim=1,
    )

    # Both slices share lora_1's shapes; stacks hold transposed matrices.
    lora_a_stacks = [
        torch.zeros(8,
                    1,
                    lora_1.lora_a.shape[1],
                    lora_1.lora_a.shape[0],
                    device="cuda",
                    dtype=dtype) for _ in range(2)
    ]
    lora_b_stacks = [
        torch.zeros(8,
                    1,
                    lora_1.lora_b.shape[1],
                    lora_1.lora_b.shape[0],
                    device="cuda",
                    dtype=dtype) for _ in range(2)
    ]
    for slot in range(lora_a_stacks[0].shape[0]):
        for slice_idx, lora in enumerate(slice_loras):
            lora_a_stacks[slice_idx][slot][0] = lora.lora_a.T
            # The scaling factor is folded into B before stacking.
            lora_b_stacks[slice_idx][slot][0] = (lora.lora_b * lora.scaling).T

    output = torch.zeros(k, m, device="cuda", dtype=dtype)
    output_slices = (half, half)
    random_indices = torch.randint(
        0, lora_a_stacks[0].shape[0], (len(x), ), device="cuda"
    )
    _apply_lora_packed_nslice(
        x, lora_a_stacks, lora_b_stacks, random_indices, output, output_slices
    )

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    # Index -1 for every row: nothing may be written to the output.
    output[:] = 0
    no_lora_indices = torch.full((len(x), ), -1, device="cuda")
    _apply_lora_packed_nslice(
        x, lora_a_stacks, lora_b_stacks, no_lora_indices, output, output_slices
    )
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()
@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None:
    """Compare ``_apply_lora_packed_nslice`` with a dense reference for q/k/v slices.

    The q adapter uses a ``qkv[0]``-row weight; the k and v adapters both use
    a ``qkv[1]``-row weight. The second call with all indices -1 must leave
    the output at zero.
    """
    manager = DummyLoRAManager()

    module_name = "module"
    weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype)
    weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype)

    manager.init_random_lora(module_name + "q", weight_q, rank=rank)
    lora_q = manager.get_module_lora(module_name + "q")
    manager.init_random_lora(module_name + "k", weight_kv, rank=rank)
    lora_k = manager.get_module_lora(module_name + "k")
    manager.init_random_lora(module_name + "v", weight_kv, rank=rank)
    lora_v = manager.get_module_lora(module_name + "v")
    slice_loras = (lora_q, lora_k, lora_v)

    x = torch.rand(k, n, device="cuda", dtype=dtype)
    expected = torch.cat(
        [x @ lora.lora_a @ lora.lora_b * lora.scaling for lora in slice_loras],
        dim=1,
    )

    def zero_stack(matrix):
        # 8 slots of the transposed shape of *matrix*.
        return torch.zeros(8,
                           1,
                           matrix.shape[1],
                           matrix.shape[0],
                           device="cuda",
                           dtype=dtype)

    # The q slice has its own shape; the k and v slices share lora_k's shape.
    lora_a_stacks = [zero_stack(lora_q.lora_a)] + [
        zero_stack(lora_k.lora_a) for _ in range(2)
    ]
    lora_b_stacks = [zero_stack(lora_q.lora_b)] + [
        zero_stack(lora_k.lora_b) for _ in range(2)
    ]
    for slot in range(lora_a_stacks[0].shape[0]):
        for slice_idx, lora in enumerate(slice_loras):
            lora_a_stacks[slice_idx][slot][0] = lora.lora_a.T
            # The scaling factor is folded into B before stacking.
            lora_b_stacks[slice_idx][slot][0] = (lora.lora_b * lora.scaling).T

    output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype)
    output_slices = (qkv[0], qkv[1], qkv[2])
    random_indices = torch.randint(
        0, lora_a_stacks[0].shape[0], (len(x), ), device="cuda"
    )
    _apply_lora_packed_nslice(
        x, lora_a_stacks, lora_b_stacks, random_indices, output, output_slices
    )

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    # Index -1 for every row: nothing may be written to the output.
    output[:] = 0
    no_lora_indices = torch.full((len(x), ), -1, device="cuda")
    _apply_lora_packed_nslice(
        x, lora_a_stacks, lora_b_stacks, no_lora_indices, output, output_slices
    )
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()

View File

@@ -1,9 +1,20 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.lora.models import LoRAModel
from vllm.lora.lora_model import LoRAModel
from vllm.lora.peft_helper import PEFTHelper
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
from vllm.model_executor.models.utils import WeightsMapper
lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"]
lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
BAICHUAN_LORA_MODULES = [
"W_pack",
"o_proj",
"gate_up_proj",
"down_proj",
]
@pytest.mark.parametrize("lora_name", lora_lst)
@@ -11,48 +22,109 @@ def test_load_checkpoints(
lora_name,
baichuan_lora_files,
baichuan_zero_lora_files,
baichuan_regex_lora_files,
chatglm3_lora_files,
):
supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
expected_lora_modules = []
for module in supported_lora_modules:
expected_lora_lst: list[str] = []
for module in BAICHUAN_LORA_MODULES:
if module in packed_modules_mapping:
expected_lora_modules.extend(packed_modules_mapping[module])
expected_lora_lst.extend(packed_modules_mapping[module])
else:
expected_lora_modules.append(module)
expected_lora_lst.append(module)
expected_lora_modules = set(expected_lora_lst)
if lora_name == "baichuan7B":
peft_helper = PEFTHelper.from_local_dir(
baichuan_lora_files, max_position_embeddings=4096
)
# For the baichuan7B model, load it's LoRA,
# and the test should pass.
LoRAModel.from_local_checkpoint(
baichuan_lora_files,
expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules)
model_vocab_size=64000,
)
elif lora_name == "baichuan7B-zero":
#Test that the target_modules contain prefix
# Test that the target_modules contain prefix
# such as "model.layers.0.self_atten.W_pack", and
# the test should pass.
peft_helper = PEFTHelper.from_local_dir(
baichuan_zero_lora_files, max_position_embeddings=4096
)
LoRAModel.from_local_checkpoint(
baichuan_zero_lora_files,
expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules)
model_vocab_size=64000,
)
elif lora_name == "baichuan7B-zero-regex":
# Test that the `target_modules` in the form of regular expressions,
# such as `model\\..*(W_pack|o_proj)`, and the test should pass.
peft_helper = PEFTHelper.from_local_dir(
baichuan_regex_lora_files, max_position_embeddings=4096
)
LoRAModel.from_local_checkpoint(
baichuan_regex_lora_files,
expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
model_vocab_size=64000,
)
else:
# For the baichuan7B model, load chatglm3-6b's LoRA,
# and the test should raise the following error.
expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501
peft_helper = PEFTHelper.from_local_dir(
chatglm3_lora_files, max_position_embeddings=4096
)
with pytest.raises(ValueError, match=expected_error):
LoRAModel.from_local_checkpoint(
chatglm3_lora_files,
expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules)
model_vocab_size=64000,
)
def test_lora_weights_mapping(baichuan_lora_files):
    """Check that a ``WeightsMapper`` renames the module names of a loaded LoRA.

    The mapper rewrites the ``model.`` prefix and the ``.layers.`` substring;
    every name in the loaded model must show both rewrites.
    """
    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping

    # Expand packed modules into their constituent names; keep the rest as-is.
    expected_lora_modules: set[str] = set()
    for module in BAICHUAN_LORA_MODULES:
        if module in packed_modules_mapping:
            expected_lora_modules.update(packed_modules_mapping[module])
        else:
            expected_lora_modules.add(module)

    prefix_rules = {"model.": "language_model.model."}
    substr_rules = {".layers.": ".baichuan_layers."}
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix=prefix_rules,
        orig_to_new_substr=substr_rules,
    )

    peft_helper = PEFTHelper.from_local_dir(
        baichuan_lora_files, max_position_embeddings=4096
    )
    lora_model = LoRAModel.from_local_checkpoint(
        baichuan_lora_files,
        expected_lora_modules,
        peft_helper=peft_helper,
        lora_model_id=1,
        device="cpu",
        model_vocab_size=64000,
        weights_mapper=hf_to_vllm_mapper,
    )

    for name in lora_model.loras:
        mapped_prefix = hf_to_vllm_mapper.orig_to_new_prefix["model."]
        assert name.startswith(mapped_prefix)
        assert ".baichuan_layers." in name

View File

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Script to test add_lora, remove_lora, pin_lora, list_loras functions.
"""
import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.lora.request import LoRARequest
from vllm.v1.engine.llm_engine import LLMEngine
MODEL_PATH = "Qwen/Qwen3-0.6B"
LORA_MODULE_PATH = "charent/self_cognition_Alice"
LORA_RANK = 8
def make_lora_request(lora_id: int):
    """Build a ``LoRARequest`` for ``LORA_MODULE_PATH`` named after and keyed by *lora_id*."""
    request_name = f"{lora_id}"
    return LoRARequest(
        lora_name=request_name,
        lora_int_id=lora_id,
        lora_path=LORA_MODULE_PATH,
    )
def test_lora_functions_sync():
    """Drive add/pin/remove on a synchronous engine and check ``list_loras`` after each step."""
    max_loras = 4
    # Create engine in eager-mode. Due to high max_loras, the CI can
    # OOM during cuda-graph capture.
    engine_args = EngineArgs(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=max_loras,
        max_lora_rank=LORA_RANK,
        max_model_len=128,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
    )

    llm = LLMEngine.from_engine_args(engine_args)

    # (action, lora id, ids expected to be listed afterwards)
    steps = [
        ("add", 1, [1]),
        ("add", 2, [1, 2]),
        # Pin LoRA 1 and test that it is never removed on subsequent adds.
        ("pin", 1, [1, 2]),
        ("add", 3, [1, 2, 3]),
        ("add", 4, [1, 2, 3, 4]),
        ("add", 5, [1, 5, 3, 4]),
        ("add", 6, [1, 5, 6, 4]),
        ("add", 7, [1, 5, 6, 7]),
        ("add", 8, [1, 8, 6, 7]),
        ("add", 9, [1, 8, 9, 7]),
        ("add", 10, [1, 8, 9, 10]),
        # Remove LoRA 1 and continue adding.
        ("remove", 1, [8, 9, 10]),
        ("add", 11, [8, 9, 10, 11]),
        ("add", 12, [12, 9, 10, 11]),
        ("add", 13, [12, 13, 10, 11]),
        # Remove all LoRAs.
        ("remove", 13, [12, 10, 11]),
        ("remove", 12, [10, 11]),
        ("remove", 11, [10]),
        ("remove", 10, []),
    ]

    for action, lora_id, expected in steps:
        if action == "add":
            llm.add_lora(make_lora_request(lora_id))
        elif action == "pin":
            llm.pin_lora(lora_id)
        else:
            llm.remove_lora(lora_id)
        assert set(llm.list_loras()) == set(expected)
@pytest.mark.asyncio
async def test_lora_functions_async():
    """Drive add/pin/remove on the async engine client and check ``list_loras`` after each step."""
    max_loras = 4
    engine_args = AsyncEngineArgs(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=max_loras,
        max_lora_rank=LORA_RANK,
        max_model_len=128,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
    )

    # (action, lora id, ids expected to be listed afterwards)
    steps = [
        ("add", 1, [1]),
        ("add", 2, [1, 2]),
        # Pin LoRA 1 and test that it is never removed on subsequent adds.
        ("pin", 1, [1, 2]),
        ("add", 3, [1, 2, 3]),
        ("add", 4, [1, 2, 3, 4]),
        ("add", 5, [1, 5, 3, 4]),
        ("add", 6, [1, 5, 6, 4]),
        ("add", 7, [1, 5, 6, 7]),
        ("add", 8, [1, 8, 6, 7]),
        ("add", 9, [1, 8, 9, 7]),
        ("add", 10, [1, 8, 9, 10]),
        # Remove LoRA 1 and continue adding.
        ("remove", 1, [8, 9, 10]),
        ("add", 11, [8, 9, 10, 11]),
        ("add", 12, [12, 9, 10, 11]),
        ("add", 13, [12, 13, 10, 11]),
        # Remove all LoRAs
        ("remove", 13, [12, 10, 11]),
        ("remove", 12, [10, 11]),
        ("remove", 11, [10]),
        ("remove", 10, []),
    ]

    async with build_async_engine_client_from_engine_args(engine_args) as llm:
        for action, lora_id, expected in steps:
            if action == "add":
                await llm.add_lora(make_lora_request(lora_id))
            elif action == "pin":
                await llm.pin_lora(lora_id)
            else:
                await llm.remove_lora(lora_id)
            assert set(await llm.list_loras()) == set(expected)

View File

@@ -0,0 +1,48 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.lora.lora_model import LoRAModel
from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.utils import get_adapter_absolute_path
from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
# Provide absolute path and huggingface lora ids
lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"]
LLAMA_LORA_MODULES = [
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
"embed_tokens",
"lm_head",
]
@pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
    """Load a LoRA checkpoint from whatever the named fixture provides.

    The fixture value is resolved with ``get_adapter_absolute_path`` before
    loading, so both parametrized fixtures go through the same code path.
    """
    lora_name = request.getfixturevalue(lora_fixture_name)
    packed_modules_mapping = Qwen3ForCausalLM.packed_modules_mapping

    # Expand packed modules into their constituent names; keep the rest as-is.
    expected_lora_modules: set[str] = set()
    for module in LLAMA_LORA_MODULES:
        if module in packed_modules_mapping:
            expected_lora_modules.update(packed_modules_mapping[module])
        else:
            expected_lora_modules.add(module)

    lora_path = get_adapter_absolute_path(lora_name)

    # Loading should work for both an absolute path and a Hugging Face id.
    peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
    load_kwargs = {
        "peft_helper": peft_helper,
        "lora_model_id": 1,
        "device": "cpu",
    }
    lora_model = LoRAModel.from_local_checkpoint(
        lora_path, expected_lora_modules, **load_kwargs
    )

    # Assertions to ensure the model is loaded correctly
    assert lora_model is not None, "LoRAModel is not loaded correctly"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,121 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ..utils import multi_gpu_test
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
PROMPT_TEMPLATE = (
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
"(<image>./</image>)\nWhat is in the image?<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
)
IMAGE_ASSETS = [
ImageAsset("stop_sign"),
]
# After fine-tuning with LoRA, all generated content should begin with `A`.
EXPECTED_OUTPUT = [
"A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501
]
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    """Generate up to 5 greedy tokens for each image asset with the fixed prompt.

    Args:
        llm: Engine used for generation.
        lora_path: Location of the LoRA adapter.
        lora_id: Adapter id; a falsy value (e.g. 0) means no LoRA request.

    Returns:
        The whitespace-stripped first completion for every input, in order.
    """
    sampling_params = vllm.SamplingParams(
        temperature=0,
        max_tokens=5,
        stop_token_ids=[128001, 128009],  # eos_id, eot_id
    )

    inputs = [
        {
            "prompt": PROMPT_TEMPLATE,
            "multi_modal_data": {"image": image_asset.pil_image},
        }
        for image_asset in IMAGE_ASSETS
    ]

    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)

    outputs = llm.generate(inputs, sampling_params, lora_request=lora_request)

    # Echo every completion while collecting them.
    generated_texts: list[str] = []
    for request_output in outputs:
        text = request_output.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Generated text: {text!r}")
    return generated_texts
def test_minicpmv_lora(minicpmv_lora_files):
    """Check that LoRA ids 1 and 2 both generate a prefix of the expected caption."""
    engine_kwargs = {
        "max_num_seqs": 2,
        "enable_lora": True,
        "max_loras": 2,
        "max_lora_rank": 8,
        "enforce_eager": True,
        "max_model_len": 2048,
        "limit_mm_per_prompt": {"image": 2, "video": 0},
        "trust_remote_code": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    # Generation is capped at 5 tokens, so each output is compared as a
    # prefix of the full expected text.
    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
    for idx, expected in enumerate(EXPECTED_OUTPUT):
        assert expected.startswith(output1[idx])

    output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
    for idx, expected in enumerate(EXPECTED_OUTPUT):
        assert expected.startswith(output2[idx])
@pytest.mark.skipif(
    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
@multi_gpu_test(num_gpus=4)
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
    """Check the LoRA caption prefix with TP size 4 and no fully sharded LoRAs."""
    engine_kwargs = {
        "enable_lora": True,
        "max_num_seqs": 2,
        "max_loras": 4,
        "max_lora_rank": 64,
        "tensor_parallel_size": 4,
        "limit_mm_per_prompt": {"image": 2, "video": 0},
        "trust_remote_code": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    # Each output is compared as a prefix of the full expected text.
    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
    for idx, expected in enumerate(EXPECTED_OUTPUT):
        assert expected.startswith(output_tp[idx])
@pytest.mark.skipif(
    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
@multi_gpu_test(num_gpus=4)
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
    """Check the LoRA caption prefix with TP size 4 and ``fully_sharded_loras`` enabled."""
    engine_kwargs = {
        "enable_lora": True,
        "max_num_seqs": 2,
        "max_loras": 2,
        "max_lora_rank": 8,
        "tensor_parallel_size": 4,
        "trust_remote_code": True,
        "limit_mm_per_prompt": {"image": 1, "video": 0},
        "fully_sharded_loras": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    # Each output is compared as a prefix of the full expected text,
    # first for LoRA id 1 and then for LoRA id 2.
    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
    for idx, expected in enumerate(EXPECTED_OUTPUT):
        assert expected.startswith(output_tp[idx])

    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2)
    for idx, expected in enumerate(EXPECTED_OUTPUT):
        assert expected.startswith(output_tp[idx])

View File

@@ -1,26 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import vllm
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
def do_sample(llm, lora_path: str, lora_id: int):
prompts = [
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501
]
def do_sample(
llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str]
) -> list[str]:
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
)
# Print the outputs.
generated_texts = []
generated_texts: list[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()
@@ -31,23 +32,46 @@ def do_sample(llm, lora_path: str, lora_id: int):
@pytest.mark.parametrize("tp_size", [4])
def test_mixtral_lora(mixtral_lora_files, tp_size):
if torch.cuda.device_count() < tp_size:
"""Original test, the LoRA model has the common target modules, not all"""
if (
torch.cuda.device_count() < tp_size
and tp_size > 1
and current_platform.is_cuda_alike()
):
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=tp_size,
worker_use_ray=True)
expected_lora_output = [
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501
"inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501
prompts = [
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501
]
assert do_sample(llm, mixtral_lora_files,
lora_id=1) == expected_lora_output
assert do_sample(llm, mixtral_lora_files,
lora_id=2) == expected_lora_output
llm = vllm.LLM(
MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
distributed_executor_backend="ray",
tensor_parallel_size=tp_size,
)
expected_lora_output = [
[
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])" # noqa: E501
],
[
"give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])", # noqa: E501
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501
],
[
"inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])" # noqa: E501
],
]
def check_outputs(generated: list[str]):
assert len(generated) == len(expected_lora_output)
for gen, gt_choices in zip(generated, expected_lora_output):
assert gen in gt_choices
check_outputs(do_sample(llm, mixtral_lora_files, lora_id=1, prompts=prompts))
check_outputs(do_sample(llm, mixtral_lora_files, lora_id=2, prompts=prompts))

View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import pytest
import torch
from vllm import _custom_ops as ops
def round_up(x, base):
    """Round the integer ``x`` up to the nearest multiple of ``base``."""
    # Bias by ``base - 1`` and drop the remainder; for integers this equals
    # ``((x + base - 1) // base) * base``.
    biased = x + base - 1
    return biased - biased % base
def CEILDIV(x, y):
    """Return ``(x + y - 1) // y``: ``x / y`` rounded up for positive ints."""
    quotient, _ = divmod(x + y - 1, y)
    return quotient
def sample_data(num_experts, max_loras, num_tokens, topk_num, device="cuda"):
    """Generate random MoE routing data for the alignment-kernel test.

    For every token, ``topk_num`` distinct expert ids are drawn from
    ``range(num_experts)`` (via a shuffle) and one LoRA index is drawn from
    ``[0, max_loras)``. Randomness comes from the stdlib ``random`` module, so
    callers control reproducibility with ``random.seed``.

    Args:
        num_experts: Number of experts to choose from.
        max_loras: Number of LoRA slots; mapping values lie in
            ``[0, max_loras)``.
        num_tokens: Number of tokens (rows) to generate.
        topk_num: Number of experts selected per token.
        device: Device the returned tensors are created on. Defaults to
            ``"cuda"``, matching the previous hard-coded behavior.

    Returns:
        A ``(topk_ids, token_lora_mapping)`` pair of int32 tensors with shapes
        ``(num_tokens, topk_num)`` and ``(num_tokens,)``.

    Raises:
        IndexError: If ``topk_num`` exceeds ``num_experts``.
    """
    topk_rows = []
    lora_indices = []
    for _ in range(num_tokens):
        # Shuffle first, then draw the LoRA index, so the RNG stream is
        # consumed in the same order as before for a given seed.
        pool = list(range(num_experts))
        random.shuffle(pool)
        topk_rows.append([pool[j] for j in range(topk_num)])
        lora_indices.append(random.randint(0, max_loras - 1))

    # Build each tensor in one shot instead of writing element by element.
    # The reshape keeps the 2-D shape even when there are no tokens.
    topk_ids = torch.tensor(topk_rows, dtype=torch.int32, device=device).reshape(
        num_tokens, topk_num
    )
    token_lora_mapping = torch.tensor(
        lora_indices, dtype=torch.int32, device=device
    )
    return topk_ids, token_lora_mapping
@pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096])  # 81920
@pytest.mark.parametrize("topk_num", [6])
@pytest.mark.parametrize("num_experts", [64, 128, 256, 512])
@pytest.mark.parametrize("max_loras", [2, 32])
@pytest.mark.parametrize("block_size", [16])
def test_moe_lora_align_block_size(
    num_tokens, topk_num, num_experts, max_loras, block_size
):
    """Check that ``ops.moe_lora_align_block_size`` groups slots by expert.

    After the kernel runs, every non-padding entry of a block in
    ``sorted_token_ids`` must index a ``topk_ids`` element equal to the
    expert id recorded for that block in ``expert_ids``.
    """
    # Fixed seed so the sampled routing is reproducible.
    random.seed(1)
    topk_ids, token_lora_mapping = sample_data(
        num_experts, max_loras, num_tokens, topk_num
    )
    # Number of (token, top-k) slots. The output buffer is pre-filled with
    # this value and the check below treats it as "no token here".
    num_slots = topk_ids.numel()

    # Padded sizes handed to the kernel.
    max_num_tokens_padded = round_up(
        num_slots + num_experts * (block_size - 1), block_size
    )
    max_num_m_blocks = CEILDIV(max_num_tokens_padded, block_size)

    # Output buffers, one contiguous section per LoRA slot.
    sorted_token_ids = torch.full(
        (max_loras * max_num_tokens_padded,),
        num_slots,
        dtype=torch.int32,
        device="cuda",
    )
    expert_ids = torch.full(
        (max_loras * max_num_m_blocks,),
        num_experts,
        dtype=torch.int32,
        device="cuda",
    )
    num_tokens_post_pad = torch.zeros(
        (max_loras,), dtype=torch.int32, device="cuda"
    )
    adapter_enabled = torch.ones(
        (max_loras + 1,), dtype=torch.int32, device="cuda"
    )
    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda")

    # Run the kernel under test.
    ops.moe_lora_align_block_size(
        topk_ids,
        token_lora_mapping,
        num_experts,
        block_size,
        max_loras,
        max_num_tokens_padded,
        max_num_m_blocks,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_pad,
        adapter_enabled,
        lora_ids,
    )

    # Verify: all real entries of a block belong to that block's expert.
    expert_ids_per_lora = expert_ids.view(max_loras, -1)
    blocks_per_lora = sorted_token_ids.view(max_loras, -1, block_size)
    flat_topk_ids = topk_ids.view(-1)
    for lora_idx in range(max_loras):
        for block_idx in range(blocks_per_lora.size(1)):
            block = blocks_per_lora[lora_idx][block_idx]
            valid = block[block != num_slots]
            if valid.numel() == 0:
                continue
            expected_expert = expert_ids_per_lora[lora_idx][block_idx]
            assert torch.all(flat_topk_ids[valid] == expected_expert)
# Allow running this test module directly as a script via pytest.
if __name__ == "__main__":
    pytest.main([__file__])

163
tests/lora/test_olmoe_tp.py Normal file
View File

@@ -0,0 +1,163 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import vllm
from vllm.lora.request import LoRARequest
from ..utils import multi_gpu_test
# Model identifier passed as the first argument to vllm.LLM in every test here.
MODEL_PATH = "allenai/OLMoE-1B-7B-0125-Instruct"
# Text-to-SQL prompt; ``{context}`` is filled with the natural-language question.
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
"
##Instruction:
candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key.
Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key.
The People_ID of candidate is the foreign key of People_ID of people.
###Input:
{context}
###Response:"""  # noqa: E501
# Expected output prefixes, index-aligned with the prompts built in
# generate_and_test, for requests that carry a LoRA id.
EXPECTED_LORA_OUTPUT = [
    "SELECT count(*) FROM candidate",
    "SELECT count(*) FROM candidate",
    "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
    "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
]
# Expected output prefixes for requests whose LoRA id is None (base model).
EXPECTED_BASE_MODEL_OUTPUT = [
    "SELECT COUNT(Candidate_ID) FROM candidate",
    "SELECT COUNT(Candidate_ID) FROM candidate",
    "SELECT Candidate_ID, COUNT(*) as Total_Candidates\nFROM candidate\nINNER JOIN people ON candidate.People_ID = people.People_ID",  # noqa: E501
    "SELECT Candidate_ID, Poll_Source FROM candidate WHERE People_ID IN (SELECT People_ID FROM people) ORDER BY COUNT(*) DESC LIMIT 1",  # noqa: E501
]
def generate_and_test(
    llm: vllm.LLM,
    lora_path: str,
    lora_id: list[int | None] | int | None,
    compare_lower: bool = False,
) -> None:
    """Generate completions for the fixed SQL prompts and check their prefixes.

    Args:
        llm: Engine used for generation.
        lora_path: Path of the LoRA adapter used for every LoRA request.
        lora_id: A single adapter id applied to all prompts, a per-prompt list
            of ids (``None`` entries run without an adapter), or ``None`` to
            run every prompt without an adapter.
        compare_lower: When true, compare lower-cased text.

    Raises:
        AssertionError: If a list-form ``lora_id`` does not have one entry per
            prompt, the number of outputs is unexpected, or a generated text
            does not start with the expected output.
    """
    prompts = [
        PROMPT_TEMPLATE.format(context="How many candidates are there?"),
        PROMPT_TEMPLATE.format(context="Count the number of candidates."),
        PROMPT_TEMPLATE.format(
            context="Which poll resource provided the most number of candidate information?"  # noqa: E501
        ),
        PROMPT_TEMPLATE.format(
            context="Return the poll resource associated with the most candidates."
        ),
    ]

    lora_request = None
    if isinstance(lora_id, int):
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    elif isinstance(lora_id, list):
        # Fail early with a clear message instead of an IndexError below.
        assert len(lora_id) == len(prompts), (
            f"Expected {len(prompts)} LoRA ids, got {len(lora_id)}"
        )
        lora_request = [
            LoRARequest(str(i), i, lora_path) if i is not None else None
            for i in lora_id
        ]

    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    assert len(generated_texts) == len(EXPECTED_LORA_OUTPUT), (
        f"Expected {len(EXPECTED_LORA_OUTPUT)} outputs, "
        f"got {len(generated_texts)}"
    )
    # One adapter id per request, whichever form ``lora_id`` took.
    request_ids = (
        lora_id if isinstance(lora_id, list) else [lora_id] * len(generated_texts)
    )
    for index, (generated_text, req_lora_id) in enumerate(
        zip(generated_texts, request_ids)
    ):
        # Requests without an adapter are checked against base-model output.
        expected_output = (
            EXPECTED_LORA_OUTPUT[index]
            if req_lora_id is not None
            else EXPECTED_BASE_MODEL_OUTPUT[index]
        )
        if compare_lower:
            generated_text = generated_text.lower()
            expected_output = expected_output.lower()
        assert generated_text.startswith(expected_output), (
            f"Prompt {index} (lora_id={req_lora_id}): {generated_text!r} "
            f"does not start with {expected_output!r}"
        )
def test_olmoe_lora(olmoe_lora_files):
    """Run the SQL prompts with the adapter under ids 1 and 2, without TP."""
    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
    # Otherwise, the lora-test will fail due to CUDA OOM.
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "enforce_eager": True,
        "trust_remote_code": True,
        "enable_chunked_prefill": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for adapter_id in (1, 2):
        generate_and_test(llm, olmoe_lora_files, lora_id=adapter_id)
def test_olmoe_lora_mixed(olmoe_lora_files):
    """Mix LoRA and base-model requests in a single batch.

    Prompts 0 and 2 use adapter ids 1 and 3; prompts 1 and 3 carry no adapter.
    """
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "enforce_eager": True,
        "trust_remote_code": True,
        "enable_chunked_prefill": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    per_prompt_ids = [1, None, 3, None]
    generate_and_test(llm, olmoe_lora_files, lora_id=per_prompt_ids)
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@multi_gpu_test(num_gpus=2)
def test_olmoe_lora_tp2(olmoe_lora_files, fully_sharded_loras):
    """Run the LoRA checks with tensor_parallel_size=2, sharded or not."""
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "enforce_eager": True,
        "trust_remote_code": True,
        "enable_chunked_prefill": True,
        "tensor_parallel_size": 2,
        "fully_sharded_loras": fully_sharded_loras,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for adapter_id in (1, 2):
        generate_and_test(llm, olmoe_lora_files, lora_id=adapter_id)
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@multi_gpu_test(num_gpus=4)
def test_olmoe_lora_tp4(olmoe_lora_files, fully_sharded_loras):
    """Run the LoRA checks with tensor_parallel_size=4, sharded or not.

    With fully sharded LoRAs the comparison is done on lower-cased text.
    """
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "enforce_eager": True,
        "trust_remote_code": True,
        "enable_chunked_prefill": True,
        "tensor_parallel_size": 4,
        "fully_sharded_loras": fully_sharded_loras,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for adapter_id in (1, 2):
        generate_and_test(
            llm,
            olmoe_lora_files,
            lora_id=adapter_id,
            compare_lower=fully_sharded_loras,
        )

View File

@@ -0,0 +1,99 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import math
import shutil
import pytest
from vllm.config.lora import LoRAConfig
from vllm.lora.peft_helper import PEFTHelper
# Each case: (test name / directory name, adapter_config.json override,
# fragment expected in the raised ValueError message).
ERROR_CASES = [
    ("test_rank", dict(r=1024), "is greater than max_lora_rank"),
    ("test_dora", dict(use_dora=True), "does not yet support DoRA"),
    (
        "test_modules_to_save",
        dict(modules_to_save=["lm_head"]),
        "only supports modules_to_save being None",
    ),
]
def test_peft_helper_pass(llama32_lora_files, tmp_path):
    """Load a valid adapter, check its parsed fields, then check RSLoRA scaling."""
    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)

    helper = PEFTHelper.from_local_dir(
        llama32_lora_files, max_position_embeddings=4096
    )
    helper.validate_legal(lora_config)
    assert helper.r == 8
    assert helper.lora_alpha == 32
    expected_modules = [
        "down_proj",
        "embed_tokens",
        "gate_proj",
        "k_proj",
        "lm_head",
        "o_proj",
        "q_proj",
        "up_proj",
        "v_proj",
    ]
    assert sorted(helper.target_modules) == expected_modules
    assert helper.vllm_max_position_embeddings == 4096

    # test RSLoRA: copy the adapter and switch use_rslora on in its config.
    rslora_dir = tmp_path / "test_rslora"
    shutil.copytree(llama32_lora_files, rslora_dir)
    config_path = rslora_dir / "adapter_config.json"
    adapter_config = json.loads(config_path.read_text())
    adapter_config.update({"use_rslora": True})
    config_path.write_text(json.dumps(adapter_config))

    helper = PEFTHelper.from_local_dir(rslora_dir, max_position_embeddings=4096)
    helper.validate_legal(lora_config)
    # With RSLoRA the scaling factor is expected to be alpha / sqrt(r).
    expected_scaling = helper.lora_alpha / math.sqrt(helper.r)
    assert abs(helper.vllm_lora_scaling_factor - expected_scaling) < 1e-3
@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES)
def test_peft_helper_error(
    llama32_lora_files,
    tmp_path,
    test_name: str,
    config_change: dict,
    expected_error: str,
):
    """Apply an invalid override to a copied adapter and expect a ValueError."""
    # Work on a private copy so the shared adapter files stay untouched.
    adapter_dir = tmp_path / test_name
    shutil.copytree(llama32_lora_files, adapter_dir)

    # Rewrite adapter_config.json with the override applied.
    config_path = adapter_dir / "adapter_config.json"
    adapter_config = json.loads(config_path.read_text())
    adapter_config.update(config_change)
    config_path.write_text(json.dumps(adapter_config))

    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)

    # Loading plus validation must raise with the expected message fragment.
    with pytest.raises(ValueError, match=expected_error):
        helper = PEFTHelper.from_local_dir(
            adapter_dir, max_position_embeddings=4096
        )
        helper.validate_legal(lora_config)

View File

@@ -1,231 +0,0 @@
# Based on code from https://github.com/punica-ai/punica
import pytest
import torch
import vllm.lora.punica as punica
def assert_close(a, b):
    """Assert ``a`` and ``b`` are close, with tolerances chosen by ``a.dtype``.

    float32 passes ``None`` for both tolerances, deferring to the defaults of
    ``torch.testing.assert_close``. Any other dtype raises ``KeyError``.
    """
    tolerances = {
        torch.float16: (5e-3, 5e-3),
        torch.bfloat16: (3e-2, 2e-2),
        torch.float32: (None, None),
    }
    rtol, atol = tolerances[a.dtype]
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
def _lora_ref_impl(
    y_final: torch.Tensor,
    x: torch.Tensor,
    wa_T_all: torch.Tensor,
    wb_T_all: torch.Tensor,
    indicies: torch.LongTensor,
    layer_idx: int,
    scale: float,
):
    """Row-by-row float32 reference for the LoRA computation.

    For each row ``i`` of ``x`` the LoRA selected by ``indicies[i]`` is
    applied: the shrink result ``x[i] @ A`` is stored in ``y_stage_1[i]``.
    If ``wb_T_all`` is given, ``(x[i] @ A @ B) * scale`` is added to
    ``y_final[i]`` in place; otherwise the shrink result itself is added.

    Returns:
        ``(y_final, y_stage_1)``; ``y_final`` is the same tensor passed in.
    """
    batch_size = x.shape[0]
    y_stage_1 = torch.empty(
        (x.size(0), wa_T_all.size(-2)),
        dtype=torch.float32,
        device=x.device,
    )
    scale_t = torch.tensor(scale, dtype=torch.float32, device=x.device)
    has_expand = wb_T_all is not None

    # Only the first ``batch_size`` indices are consumed, one per input row.
    selected = indicies.cpu().tolist()[:batch_size]
    for row, lora_idx in enumerate(selected):
        shrink_w = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
        shrunk = x[row].unsqueeze(0).to(torch.float32) @ shrink_w
        y_stage_1[row] = shrunk.squeeze(0)
        if has_expand:
            expand_w = (
                wb_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
            )
            y_final[row] += (shrunk @ expand_w).squeeze(0) * scale_t
        else:
            y_final[row] += y_stage_1[row]
    return y_final, y_stage_1
# Hidden sizes the tests below are parametrized over.
H1 = [
    128, 256, 512, 1024, 1152, 1280, 1536, 2048,
    2304, 2560, 2752, 3072, 3456, 3584, 4096, 4608,
    5120, 5504, 5632, 6144, 6848, 6912, 7168, 8192,
    9216, 10240, 11008, 13824, 14336, 15360, 22016, 24576,
    27392, 32000, 32256, 32512, 32768, 33024, 36864, 43264,
    49152, 64000, 64256, 102400, 102656, 128000, 128256,
]
# H2 is a separate list: the H1 sizes with 64 prepended.
H2 = [64, *H1]
# LoRA ranks exercised by test_lora_a_extra_shapes.
R = [1, 2, 4]
# Seed(s) passed to torch.manual_seed.
SEED = [0xabcdabcd987]
# Use at most two GPUs, and never name a device that is not present. The
# previous expression produced "cuda:0" and "cuda:1" when no GPU was visible.
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(min(torch.cuda.device_count(), 2))
]
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("r", R)
@pytest.mark.parametrize("seed", SEED)
@torch.inference_mode()
def test_lora_a_extra_shapes(dtype_str, h1, r, seed):
    """Compare ``punica.bgmv`` with the reference for shrink-only shapes."""
    torch.manual_seed(seed)
    num_loras, num_layers, bs = 4, 1, 32
    dtype = getattr(torch, dtype_str)
    device = torch.device("cuda")
    tensor_kwargs = {"dtype": dtype, "device": device}

    wa_T_all = torch.randn(num_loras, num_layers, r, h1, **tensor_kwargs)
    indices = torch.randint(num_loras, (bs,), dtype=torch.long, device=device)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, **tensor_kwargs)
        y = torch.randn(bs, r, **tensor_kwargs)

        # Reference path: no B matrix, scale 1.0.
        y_ref = y.clone()
        _lora_ref_impl(y_ref, x, wa_T_all, None, indices, layer_idx, 1.0)

        # Kernel under test, starting from the same ``y``.
        y_our = y.clone()
        punica.bgmv(y_our, x, wa_T_all, indices, layer_idx, 1.0)

        assert_close(y_ref, y_our)
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_lora_correctness(dtype_str, h1, h2, seed, device):
    """Compare ``punica.add_lora`` with the reference for shrink + expand."""
    torch.manual_seed(seed)
    num_loras, num_layers = 4, 1
    r, bs = 8, 32
    scale = 0.123
    dtype = getattr(torch, dtype_str)
    # Tensors below are created without an explicit device, so they land on
    # the default device set here.
    torch.set_default_device(device)

    wa_T_all = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
    wb_T_all = torch.randn(num_loras, num_layers, h2, r, dtype=dtype)
    indices = torch.randint(num_loras, (bs,), dtype=torch.long)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype)
        y = torch.randn(bs, h2, dtype=dtype)

        y_ref = y.clone()
        _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale)

        y_our = y.clone()
        punica.add_lora(
            y_our, x, wa_T_all, wb_T_all, indices, layer_idx, scale
        )

        assert_close(y_ref, y_our)
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_lora_correctness_slice(dtype_str, h1, h2, seed, device):
    """Compare ``punica.add_lora_slice`` with the reference on three slices.

    The output of width ``h2`` is split into three equal column slices, each
    with its own A/B weights.
    """
    if h2 % 3 != 0 or h2 // 3 not in H1:
        pytest.skip("h2 must be divisible by 3 and in supported shapes")
    torch.manual_seed(seed)
    num_loras, num_layers = 4, 1
    r, bs = 8, 32
    scale = 0.123
    num_slices = 3
    dtype = getattr(torch, dtype_str)
    torch.set_default_device(device)

    # All A weights are drawn first, then all B weights, keeping the random
    # stream in its original order.
    wa_slices = [
        torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
        for _ in range(num_slices)
    ]
    wb_slices = [
        torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype)
        for _ in range(num_slices)
    ]
    indices = torch.randint(num_loras, (bs,), dtype=torch.long)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype)
        y = torch.randn(bs, h2, dtype=dtype)
        s = h2 // 3
        # Column range covered by each slice; together they span all of h2.
        spans = [(k * s, (k + 1) * s) for k in range(num_slices)]

        y_ref = y.clone()
        for (start, stop), wa, wb in zip(spans, wa_slices, wb_slices):
            _lora_ref_impl(
                y_ref[:, start:stop], x, wa, wb, indices, layer_idx, scale
            )

        y_our = y.clone()
        for (start, _), wa, wb in zip(spans, wa_slices, wb_slices):
            punica.add_lora_slice(
                y_our, x, wa, wb, indices, layer_idx, scale, start, s
            )

        for start, stop in spans:
            assert_close(y_ref[:, start:stop], y_our[:, start:stop])

View File

@@ -0,0 +1,475 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from threading import Lock
import pytest
import torch
import vllm.lora.ops.torch_ops as torch_ops
import vllm.lora.ops.triton_ops as triton_ops
from vllm.lora.ops.triton_ops import LoRAKernelMeta
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.platforms import current_platform
from .utils import PunicaTensors, assert_close, generate_data_for_nslices
@pytest.fixture(autouse=True)
def reset_device(reset_default_device):
    """Autouse wrapper that requests ``reset_default_device`` for every test here.

    The body is intentionally empty; depending on the other fixture is the
    whole effect.
    """
    pass
# Utility shrink and expand operations used as reference implementations.
def sgmv_shrink_for_nslices(
    nslices: int,
    inputs_tensor: torch.Tensor,
    lora_weights_lst: list[torch.Tensor],
    out_tensor: torch.Tensor,
    b_seq_start_loc: torch.Tensor,
    seq_len_tensor: torch.Tensor,
    prompt_lora_mapping: torch.Tensor,
    batches: int,
    max_seq_length: int,
    num_tokens: int,
    scaling: float,
):
    """
    Call torch_ops.sgmv_shrink once per slice.

    Slice ``i`` uses ``lora_weights_lst[i]`` and writes into ``out_tensor[i]``;
    every other argument is shared across slices.
    """
    shared_args = (
        b_seq_start_loc,
        seq_len_tensor,
        prompt_lora_mapping,
        batches,
        max_seq_length,
        num_tokens,
        scaling,
    )
    for slice_idx in range(nslices):
        torch_ops.sgmv_shrink(
            inputs_tensor,
            lora_weights_lst[slice_idx],
            out_tensor[slice_idx],
            *shared_args,
        )
def sgmv_expand_for_nslices(
    nslices: int,
    hidden_size: int,
    inputs_tensor: torch.Tensor,
    lora_weights_lst: list[torch.Tensor],
    out_tensor: torch.Tensor,
    b_seq_start_loc: torch.Tensor,
    seq_len_tensor: torch.Tensor,
    prompt_lora_mapping: torch.Tensor,
    batches: int,
    max_seq_length: int,
    num_tokens: int,
    add_inputs: bool,
) -> None:
    """
    Call the torch_ops expand reference for any number of slices.

    A single slice goes through torch_ops.sgmv_expand. Otherwise slice ``i``
    goes through torch_ops.sgmv_expand_slice at column offset
    ``i * hidden_size`` with width ``hidden_size``.
    """
    seq_args = (
        b_seq_start_loc,
        seq_len_tensor,
        prompt_lora_mapping,
        batches,
        max_seq_length,
        num_tokens,
    )
    if nslices == 1:
        # Verify the torch's sgmv_expand op
        torch_ops.sgmv_expand(
            inputs_tensor[0],
            lora_weights_lst[0],
            out_tensor,
            *seq_args,
            add_inputs=add_inputs,
        )
        return

    for slice_idx in range(nslices):
        torch_ops.sgmv_expand_slice(
            inputs_tensor[slice_idx],
            lora_weights_lst[slice_idx],
            out_tensor,
            *seq_args,
            slice_idx * hidden_size,
            hidden_size,
            add_inputs=add_inputs,
        )
# Held while clearing the module-level _LORA_A_PTR_DICT / _LORA_B_PTR_DICT
# caches and launching the Triton kernel in the check_* helpers.
_dict_lock = Lock()
def check_lora_shrink_kernel(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    device: str,
    seq_length: int,
    scaling: float,
):
    """
    Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
    kernels.

    Test data comes from generate_data_for_nslices in "shrink" mode. The
    Triton kernel writes to a clone of ``data.our_out_tensor`` and the
    reference writes to ``data.ref_out_tensor``; the two are then compared
    with assert_close.
    """
    data: PunicaTensors = generate_data_for_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        nslices,
        dtype,
        "shrink",
        device,
    )
    max_seq_length, token_nums = data.meta()

    # Setup metadata information for SGMV and reference kernels
    sgmv_meta_args = (
        data.b_seq_start_loc,
        data.seq_len_tensor,
        data.prompt_lora_mapping,
        batches,
        max_seq_length,
        token_nums,
    )

    # Setup metadata information for the LoRA kernel. Build it on the
    # requested device (not a hard-coded "cuda") so it matches the data.
    lora_meta = LoRAKernelMeta.make(
        max_loras=num_loras, max_num_tokens=token_nums, device=device
    )
    lora_meta.prepare_tensors(data.token_lora_mapping)

    ref_out_tensor = data.ref_out_tensor
    out_tensor = data.our_out_tensor.clone()

    # Clear the LoRA-A pointer cache under the lock before launching.
    # NOTE(review): presumably this avoids reusing stale cached pointers from
    # earlier parametrizations; confirm against triton_ops.utils.
    with _dict_lock:
        # lora_shrink kernel
        _LORA_A_PTR_DICT.clear()
        triton_ops.lora_shrink(
            data.inputs_tensor,
            data.lora_weights,
            out_tensor,
            *lora_meta.meta_args(token_nums=token_nums),
            scaling,
        )

    # Reference
    sgmv_shrink_for_nslices(
        nslices,
        data.inputs_tensor,
        data.lora_weights,
        ref_out_tensor,
        *sgmv_meta_args,
        scaling,
    )

    assert_close(out_tensor, ref_out_tensor)
def check_lora_expand_kernel(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    device: str,
    seq_length: int,
    add_inputs: bool,
):
    """
    Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
    kernels.

    Test data comes from generate_data_for_nslices in "expand" mode. The
    Triton kernel writes to a clone of ``data.our_out_tensor`` and the
    reference writes to ``data.ref_out_tensor``; the two are then compared
    with assert_close.
    """
    data: PunicaTensors = generate_data_for_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        nslices,
        dtype,
        "expand",
        device,
    )
    max_seq_length, token_nums = data.meta()

    # Setup metadata information for SGMV and reference kernels
    sgmv_meta_args = (
        data.b_seq_start_loc,
        data.seq_len_tensor,
        data.prompt_lora_mapping,
        batches,
        max_seq_length,
        token_nums,
    )

    # Setup metadata information for the LoRA kernel. Build it on the
    # requested device (not a hard-coded "cuda") so it matches the data.
    lora_meta = LoRAKernelMeta.make(
        max_loras=num_loras, max_num_tokens=token_nums, device=device
    )
    lora_meta.prepare_tensors(data.token_lora_mapping)

    # Setup output tensors
    ref_out_tensor = data.ref_out_tensor
    out_tensor = data.our_out_tensor.clone()

    # Clear the LoRA-B pointer cache under the lock before launching.
    with _dict_lock:
        # lora_expand kernel
        _LORA_B_PTR_DICT.clear()
        triton_ops.lora_expand(
            data.inputs_tensor,
            data.lora_weights,
            out_tensor,
            *lora_meta.meta_args(token_nums=token_nums),
            offset_start=0,
            add_inputs=add_inputs,
        )

    # Reference
    sgmv_expand_for_nslices(
        nslices,
        hidden_size,
        data.inputs_tensor,
        data.lora_weights,
        ref_out_tensor,
        *sgmv_meta_args,
        add_inputs=add_inputs,
    )

    assert_close(out_tensor, ref_out_tensor)
# Tests
# We test the punica kernels along 2 verticals mainly.
# 1. Variations in hidden_dim size
# 2. Variations in all other parameters like (batch_size, max_rank, num_loras
# etc.)
# We have collected the hidden sizes used by the LoRA models currently
# supported by vLLM. Each size is also divided by the tensor-parallel
# degrees listed in `divisibility` below, to check that the corresponding
# Triton kernels run correctly on the resulting per-rank shapes.
# Base hidden sizes, before division by the tensor-parallel degrees below.
HIDDEN_SIZES = [
    128, 256, 512, 896, 1024, 1152, 1216, 1280,
    1536, 1664, 2048, 2240, 2304, 2368, 2432, 2560,
    2752, 3072, 3328, 3456, 3584, 3712, 4096, 4480,
    4608, 4736, 4864, 5120, 5504, 5632, 5888, 6144,
    6400, 6848, 6912, 7168, 7424, 8192, 8960, 9216,
    9472, 10240, 11008, 11264, 13824, 14336, 14784, 14848,
    15360, 18944, 22016, 22528, 24576, 27392, 27648, 29568,
    29696, 32000, 32256, 32512, 32768, 33024, 36864, 43264,
    49152, 49408, 60544, 60672, 64000, 64256, 102400, 102656,
    128000, 128256,
]
# The size of TP
divisibility = [1, 2, 8, 16, 64]
# Every base size floor-divided by every TP degree (duplicates included).
all_hidden_size = [
    hidden_size // div for div in divisibility for hidden_size in HIDDEN_SIZES
]
# De-duplicate and sort, so the parametrization order does not depend on
# set iteration order.
HIDDEN_SIZES = sorted(set(all_hidden_size))
# Parameters for the hidden-size sweep: every hidden size, with a single
# value for each of the other dimensions.
hs_test_params = dict(
    hidden_sizes=HIDDEN_SIZES,
    batches=[4],
    num_loras=[4],
    max_ranks=[32],
)
# Parameters for the general sweep: one hidden size, with variation in all
# the other dimensions.
test_params = dict(
    hidden_sizes=[2049],
    batches=[1, 4, 16, 32],
    num_loras=[1, 8, 32, 128],
    max_ranks=[1, 4, 8, 16, 32, 64, 128, 256],
)
DTYPES = [torch.float16, torch.bfloat16]
DEVICES = ["cuda:0"]
SEED = [0]
@pytest.mark.parametrize("batches", test_params["batches"])
@pytest.mark.parametrize("num_loras", test_params["num_loras"])
@pytest.mark.parametrize("rank", test_params["max_ranks"])
@pytest.mark.parametrize("hidden_size", test_params["hidden_sizes"])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_kernels(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    device: str,
    seed: int,
    op_type: str,
):
    """
    Tests LoRA kernels.
    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    # Arguments shared by the shrink and expand checks.
    shared_kwargs = dict(
        batches=batches,
        num_loras=num_loras,
        rank=rank,
        hidden_size=hidden_size,
        nslices=nslices,
        dtype=dtype,
        device=device,
        seq_length=128,
    )
    if op_type == "shrink":
        check_lora_shrink_kernel(**shared_kwargs, scaling=0.5)
    else:
        check_lora_expand_kernel(**shared_kwargs, add_inputs=True)
@pytest.mark.parametrize("batches", hs_test_params["batches"])
@pytest.mark.parametrize("num_loras", hs_test_params["num_loras"])
@pytest.mark.parametrize("rank", hs_test_params["max_ranks"])
@pytest.mark.parametrize("hidden_size", hs_test_params["hidden_sizes"])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_kernels_hidden_size(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    device: str,
    seed: int,
    op_type: str,
):
    """
    Tests SGMV and LoRA kernels.
    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    # Arguments shared by the shrink and expand checks.
    shared_kwargs = dict(
        batches=batches,
        num_loras=num_loras,
        rank=rank,
        hidden_size=hidden_size,
        nslices=nslices,
        dtype=dtype,
        device=device,
        seq_length=128,
    )
    if op_type == "shrink":
        check_lora_shrink_kernel(**shared_kwargs, scaling=0.5)
    else:
        check_lora_expand_kernel(**shared_kwargs, add_inputs=True)

View File

@@ -1,14 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
from dataclasses import dataclass
from typing import List
import pytest
import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
from .conftest import cleanup
from vllm.platforms import current_platform
@dataclass
@@ -17,15 +19,28 @@ class ModelWithQuantization:
quantization: str
MODELS: List[ModelWithQuantization] = [
ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
quantization="AWQ"),
ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="GPTQ"),
]
MODELS: list[ModelWithQuantization]
# AWQ quantization is currently not supported in ROCm.
if current_platform.is_rocm():
MODELS = [
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
),
]
else:
MODELS = [
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization="awq"
),
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
),
]
def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
def do_sample(
llm: vllm.LLM, lora_path: str, lora_id: int, max_tokens: int = 256
) -> list[str]:
raw_prompts = [
"Give me an orange-ish brown color",
"Give me a neon pink color",
@@ -36,16 +51,16 @@ def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
prompts = [format_prompt_tuples(p) for p in raw_prompts]
sampling_params = vllm.SamplingParams(temperature=0,
max_tokens=max_tokens,
stop=["<|im_end|>"])
sampling_params = vllm.SamplingParams(
temperature=0, max_tokens=max_tokens, stop=["<|im_end|>"]
)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
)
# Print the outputs.
generated_texts = []
generated_texts: list[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
@@ -55,44 +70,31 @@ def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < tp_size:
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm = vllm.LLM(model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_model_len=400,
tensor_parallel_size=tp_size,
quantization=model.quantization,
trust_remote_code=True)
def test_quant_model_lora(tinyllama_lora_files, model):
llm = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_model_len=400,
gpu_memory_utilization=0.2, # avoid OOM
quantization=model.quantization,
trust_remote_code=True,
enable_chunked_prefill=True,
tokenizer=tinyllama_lora_files,
)
if model.quantization is None:
expected_no_lora_output = [
"Here are some examples of orange-brown colors",
"I'm sorry, I don't have"
]
expected_lora_output = [
"#ff8050",
"#ff8080",
]
elif model.quantization == "AWQ":
expected_no_lora_output = [
"I'm sorry, I don't understand",
"I'm sorry, I don't understand",
]
elif model.quantization == "awq":
expected_lora_output = [
"#f07700: A v",
"#f00000: A v",
]
elif model.quantization == "GPTQ":
expected_no_lora_output = [
"I'm sorry, I don't have",
"I'm sorry, I don't have",
]
elif model.quantization == "gptq":
expected_lora_output = [
"#f08800: This is",
"#f07788 \n#",
@@ -101,79 +103,65 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
def expect_match(output, expected_output):
# HACK: GPTQ lora outputs are just incredibly unstable.
# Assert that the outputs changed.
if (model.quantization == "GPTQ"
and expected_output is expected_lora_output):
assert output != expected_no_lora_output
if model.quantization == "gptq" and expected_output is expected_lora_output:
for i, o in enumerate(output):
assert o.startswith(
'#'), f"Expected example {i} to start with # but got {o}"
assert o.startswith("#"), (
f"Expected example {i} to start with # but got {o}"
)
return
assert output == expected_output
max_tokens = 10
print("lora adapter created")
output = do_sample(llm,
tinyllama_lora_files,
lora_id=0,
max_tokens=max_tokens)
expect_match(output, expected_no_lora_output)
print("lora 1")
output = do_sample(llm,
tinyllama_lora_files,
lora_id=1,
max_tokens=max_tokens)
output = do_sample(llm, tinyllama_lora_files, lora_id=1, max_tokens=max_tokens)
expect_match(output, expected_lora_output)
print("no lora")
output = do_sample(llm,
tinyllama_lora_files,
lora_id=0,
max_tokens=max_tokens)
expect_match(output, expected_no_lora_output)
print("lora 2")
output = do_sample(llm,
tinyllama_lora_files,
lora_id=2,
max_tokens=max_tokens)
output = do_sample(llm, tinyllama_lora_files, lora_id=2, max_tokens=max_tokens)
expect_match(output, expected_lora_output)
print("removing lora")
del llm
cleanup()
cleanup_dist_env_and_memory()
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.skip("Requires multiple GPUs")
def test_quant_model_tp_equality(tinyllama_lora_files, model):
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < 2:
# pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
llm_tp1 = vllm.LLM(model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=1,
quantization=model.quantization,
trust_remote_code=True)
def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model):
if num_gpus_available < 2:
pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
if model.quantization == "gptq":
pytest.skip("GPTQ lora outputs are just incredibly unstable")
llm_tp1 = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
gpu_memory_utilization=0.2, # avoid OOM
quantization=model.quantization,
trust_remote_code=True,
enable_chunked_prefill=True,
)
output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
del llm_tp1
cleanup()
cleanup_dist_env_and_memory()
llm_tp2 = vllm.LLM(model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=2,
quantization=model.quantization)
llm_tp2 = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=2,
gpu_memory_utilization=0.2, # avoid OOM
quantization=model.quantization,
enable_chunked_prefill=True,
)
output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
del llm_tp2
cleanup()
cleanup_dist_env_and_memory()
assert output_tp1 == output_tp2

177
tests/lora/test_qwen2vl.py Normal file
View File

@@ -0,0 +1,177 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.sampling_params import BeamSearchParams
@dataclass
class TestConfig:
model_path: str
lora_path: str
max_num_seqs: int = 2
max_loras: int = 2
max_lora_rank: int = 16
max_model_len: int = 4096
mm_processor_kwargs: dict[str, int] | None = None
def __post_init__(self):
if self.mm_processor_kwargs is None:
self.mm_processor_kwargs = {
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
}
class Qwen2VLTester:
    """Test helper for Qwen2 VL models with LoRA.

    Builds one LoRA-enabled ``vllm.LLM`` from a ``TestConfig`` and offers two
    checks against it: plain generation (``run_test``) and beam search
    (``run_beam_search_test``).
    """

    PROMPT_TEMPLATE = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
        "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
        "What is in the image?<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    def __init__(self, config: TestConfig):
        self.config = config
        self.llm = self._initialize_llm()

    def _initialize_llm(self) -> vllm.LLM:
        """Initialize the LLM with given configuration"""
        return vllm.LLM(
            model=self.config.model_path,
            max_num_seqs=self.config.max_num_seqs,
            enable_lora=True,
            max_loras=self.config.max_loras,
            max_lora_rank=self.config.max_lora_rank,
            trust_remote_code=True,
            mm_processor_kwargs=self.config.mm_processor_kwargs,
            max_model_len=self.config.max_model_len,
        )

    def run_test(
        self,
        images: list[ImageAsset],
        expected_outputs: list[str],
        lora_id: int | None = None,
        temperature: float = 0,
        max_tokens: int = 5,
    ):
        """Generate one answer per image and compare it with the expectation.

        Args:
            images: Image assets; one prompt is built per asset.
            expected_outputs: Full expected texts, paired with ``images`` by
                position.
            lora_id: ID used for both the name and the integer ID of the
                ``LoRARequest``.
            temperature: Sampling temperature.
            max_tokens: Maximum number of tokens to generate per prompt.

        Raises:
            AssertionError: If a stripped generated text is not a prefix of
                its expected text.
        """
        sampling_params = vllm.SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
        )
        inputs = [
            {
                "prompt": self.PROMPT_TEMPLATE,
                "multi_modal_data": {"image": asset.pil_image},
            }
            for asset in images
        ]

        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
        outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
        generated_texts = [output.outputs[0].text.strip() for output in outputs]

        # Validate outputs. Generation is capped at max_tokens, so the check
        # is that the generated text is a prefix of the full expected text.
        for generated, expected in zip(generated_texts, expected_outputs):
            assert expected.startswith(generated), (
                f"Generated text {generated} doesn't "
                f"match expected pattern {expected}"
            )

    def run_beam_search_test(
        self,
        images: list[ImageAsset],
        expected_outputs: list[list[str]],
        lora_id: int | None = None,
        temperature: float = 0,
        beam_width: int = 2,
        max_tokens: int = 5,
    ):
        """Run beam search per image and compare every beam's text exactly.

        Args:
            images: Image assets; one prompt is built per asset.
            expected_outputs: One list of expected beam texts per image,
                paired with ``images`` by position.
            lora_id: ID used for both the name and the integer ID of the
                ``LoRARequest``.
            temperature: Temperature passed to ``BeamSearchParams``.
            beam_width: Beam width passed to ``BeamSearchParams``.
            max_tokens: Maximum number of tokens passed to
                ``BeamSearchParams``.

        Raises:
            AssertionError: If the beam texts of any image differ from the
                expected list.
        """
        beam_search_params = BeamSearchParams(
            beam_width=beam_width, max_tokens=max_tokens, temperature=temperature
        )

        inputs = [
            {
                "prompt": self.PROMPT_TEMPLATE,
                "multi_modal_data": {"image": asset.pil_image},
            }
            for asset in images
        ]

        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
        outputs = self.llm.beam_search(
            inputs, beam_search_params, lora_request=lora_request
        )

        for output_obj, expected_outs in zip(outputs, expected_outputs):
            output_texts = [seq.text for seq in output_obj.sequences]
            assert output_texts == expected_outs, (
                f"Generated texts {output_texts} do not match expected {expected_outs}"
            )  # noqa: E501
# Image assets used by the generation tests; EXPECTED_OUTPUTS below is paired
# with this list by position.
TEST_IMAGES = [
    ImageAsset("stop_sign"),
    ImageAsset("cherry_blossom"),
]

# Full expected descriptions, one per entry of TEST_IMAGES. The tests only
# require the (short) generated text to be a prefix of these strings.
EXPECTED_OUTPUTS = [
    "A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.",  # noqa: E501
    "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
]

# NOTE - beam search .text contains the whole text
# (prompt included). One inner list per input image, one string per beam.
EXPECTED_BEAM_SEARCH_OUTPUTS = [
    [
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic skyscraper stands",  # noqa: E501
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic tower stands tall",  # noqa: E501
    ],
]

# Base model identifiers passed as TestConfig.model_path by the tests below.
QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
def test_qwen2vl_lora(qwen2vl_lora_files):
    """Test Qwen 2.0 VL model with LoRA"""
    tester = Qwen2VLTester(
        TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    )

    # The same adapter files are exercised under two distinct LoRA IDs.
    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=adapter_id
        )
def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
    """Test Qwen 2.0 VL model with LoRA through beam search."""
    tester = Qwen2VLTester(
        TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    )

    # NOTE: only the cherry blossom image is tested, because the stop sign
    # output is slightly different for v1. The root cause is likely unrelated
    # to the intent of this test, which is to ensure that beam search passes
    # the LoRA request through correctly.
    cherry_blossom_only = [ImageAsset("cherry_blossom")]

    # The same adapter files are exercised under two distinct LoRA IDs.
    for adapter_id in (1, 2):
        tester.run_beam_search_test(
            cherry_blossom_only,
            expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
            lora_id=adapter_id,
        )
def test_qwen25vl_lora(qwen25vl_lora_files):
    """Test Qwen 2.5 VL model with LoRA"""
    tester = Qwen2VLTester(
        TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
    )

    # The same adapter files are exercised under two distinct LoRA IDs.
    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=adapter_id
        )

View File

@@ -0,0 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# NOTE To avoid overloading the CI pipeline, this test script will not
# be triggered on CI and is primarily intended for local testing and verification.
import vllm
from vllm.lora.request import LoRARequest
from ..utils import multi_gpu_test
# Base model loaded by every test in this file.
MODEL_PATH = "Qwen/Qwen3-30B-A3B"

# Text-to-SQL prompt; the only placeholder is {context}, filled with the
# natural-language question via str.format.
PROMPT_TEMPLATE = """<|im_start|>user
I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
"
##Instruction:
candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key.
Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key.
The People_ID of candidate is the foreign key of People_ID of people.
###Input:
{context}
###Response:<|im_end|>
<|im_start|>assistant"""  # noqa: E501

# Expected prefix of each generated answer, in the same order as the prompts
# built in generate_and_test.
EXPECTED_LORA_OUTPUT = [
    "<think>\n\n</think>\n\nSELECT count(*) FROM candidate",
    "<think>\n\n</think>\n\nSELECT count(*) FROM candidate",
    "<think>\n\n</think>\n\nSELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
    "<think>\n\n</think>\n\nSELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
]
def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
    """Generate SQL for four fixed questions and check the expected prefixes.

    Args:
        llm: Engine used for generation.
        lora_path: Path of the LoRA adapter files.
        lora_id: Adapter ID; a falsy value (0) sends no LoRA request.

    Raises:
        AssertionError: If a generated text does not start with the matching
            entry of ``EXPECTED_LORA_OUTPUT``.
    """
    questions = (
        "How many candidates are there?",
        "Count the number of candidates.",
        "Which poll resource provided the most number of candidate information?",  # noqa: E501
        "Return the poll resource associated with the most candidates.",
    )
    prompts = [PROMPT_TEMPLATE.format(context=question) for question in questions]

    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
    adapter = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
    outputs = llm.generate(prompts, sampling_params, lora_request=adapter)

    # Print the outputs.
    generated_texts: list[str] = []
    for result in outputs:
        text = result.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Prompt: {result.prompt!r}, Generated text: {text!r}")

    for idx, expected_prefix in enumerate(EXPECTED_LORA_OUTPUT):
        assert generated_texts[idx].startswith(expected_prefix)
def test_qwen3moe_lora(qwen3moe_lora_files):
    """Run the LoRA generation check with the default parallel settings."""
    # enforce_eager=True is enabled here to reduce VRAM usage for the
    # lora-test CI; otherwise the lora-test fails due to CUDA OOM.
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "enforce_eager": True,
        "trust_remote_code": True,
        "enable_chunked_prefill": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for adapter_id in (1, 2):
        generate_and_test(llm, qwen3moe_lora_files, lora_id=adapter_id)
@multi_gpu_test(num_gpus=2)
def test_qwen3moe_lora_tp2(qwen3moe_lora_files):
    """Run the LoRA generation check with tensor_parallel_size=2."""
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "enforce_eager": True,
        "trust_remote_code": True,
        "enable_chunked_prefill": True,
        "tensor_parallel_size": 2,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for adapter_id in (1, 2):
        generate_and_test(llm, qwen3moe_lora_files, lora_id=adapter_id)
@multi_gpu_test(num_gpus=4)
def test_qwen3moe_lora_tp4(qwen3moe_lora_files):
    """Run the LoRA generation check with tensor_parallel_size=4."""
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "enforce_eager": True,
        "trust_remote_code": True,
        "enable_chunked_prefill": True,
        "tensor_parallel_size": 4,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for adapter_id in (1, 2):
        generate_and_test(llm, qwen3moe_lora_files, lora_id=adapter_id)

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
class DummyLoRAResolver(LoRAResolver):
    """A dummy LoRA resolver for testing."""

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Resolve only the adapter named ``test_lora``; otherwise return None."""
        if lora_name != "test_lora":
            return None

        dummy_path = f"/dummy/path/{base_model_name}/{lora_name}"
        return LoRARequest(
            lora_name=lora_name,
            lora_path=dummy_path,
            lora_int_id=abs(hash(lora_name)),
        )
def test_resolver_registry_registration():
    """Test basic resolver registration functionality."""
    dummy = DummyLoRAResolver()

    # Registering makes the name show up among the supported resolvers.
    LoRAResolverRegistry.register_resolver("dummy", dummy)
    supported = LoRAResolverRegistry.get_supported_resolvers()
    assert "dummy" in supported

    # Lookup hands back the very object that was registered.
    assert LoRAResolverRegistry.get_resolver("dummy") is dummy
def test_resolver_registry_duplicate_registration():
    """Test registering a resolver with an existing name."""
    first, second = DummyLoRAResolver(), DummyLoRAResolver()

    # Register both under the same name, one after the other.
    for candidate in (first, second):
        LoRAResolverRegistry.register_resolver("dummy", candidate)

    # The later registration is the one that is returned.
    assert LoRAResolverRegistry.get_resolver("dummy") is second
def test_resolver_registry_unknown_resolver():
    """Test getting a non-existent resolver."""
    missing_name = "unknown_resolver"
    with pytest.raises(KeyError, match="not found"):
        LoRAResolverRegistry.get_resolver(missing_name)
@pytest.mark.asyncio
async def test_dummy_resolver_resolve():
    """Test the dummy resolver's resolve functionality."""
    resolver = DummyLoRAResolver()
    base_model_name, lora_name = "base_model_test", "test_lora"

    # Known adapter name: a fully populated request comes back.
    resolved = await resolver.resolve_lora(base_model_name, lora_name)
    assert isinstance(resolved, LoRARequest)
    assert resolved.lora_name == lora_name
    assert resolved.lora_path == f"/dummy/path/{base_model_name}/{lora_name}"

    # Unknown adapter name: nothing is resolved.
    unresolved = await resolver.resolve_lora(base_model_name, "nonexistent_lora")
    assert unresolved is None

View File

@@ -1,55 +0,0 @@
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import get_lora_tokenizer
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
from ..conftest import get_tokenizer_pool_config
@pytest.mark.asyncio
@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
    """Check that a tokenizer group encodes and resolves LoRA tokenizers."""
    reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
    pool_config = get_tokenizer_pool_config(tokenizer_group_type)
    tokenizer_group = get_tokenizer_group(
        pool_config,
        tokenizer_id="gpt2",
        enable_lora=True,
        max_num_seqs=1,
        max_input_length=None,
    )
    lora_request = LoRARequest("1", 1, sql_lora_files)
    encode_kwargs = {
        "request_id": "request_id",
        "prompt": "prompt",
        "lora_request": lora_request,
    }

    # Sync and async encoding must both match the reference tokenizer.
    reference_ids = reference_tokenizer.encode("prompt")
    assert reference_ids == tokenizer_group.encode(**encode_kwargs)
    reference_ids = reference_tokenizer.encode("prompt")
    assert reference_ids == await tokenizer_group.encode_async(**encode_kwargs)

    # Without a LoRA request, sync and async lookups agree.
    base_tokenizer = tokenizer_group.get_lora_tokenizer(None)
    assert isinstance(base_tokenizer, PreTrainedTokenizerBase)
    base_sync = tokenizer_group.get_lora_tokenizer(None)
    base_async = await tokenizer_group.get_lora_tokenizer_async(None)
    assert base_sync == base_async

    # With a LoRA request, the tokenizer differs from the base one and the
    # sync and async lookups still agree.
    lora_tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
    assert isinstance(lora_tokenizer, PreTrainedTokenizerBase)
    lora_sync = tokenizer_group.get_lora_tokenizer(lora_request)
    assert lora_sync != tokenizer_group.get_lora_tokenizer(None)
    lora_sync = tokenizer_group.get_lora_tokenizer(lora_request)
    lora_async = await tokenizer_group.get_lora_tokenizer_async(lora_request)
    assert lora_sync == lora_async
def test_get_lora_tokenizer(sql_lora_files, tmpdir):
    """Check get_lora_tokenizer for no request, a real adapter, and tmpdir."""
    # No request: the result is falsy.
    assert not get_lora_tokenizer(None)

    # Adapter directory: the tokenizer reports added vocabulary.
    adapter_request = LoRARequest("1", 1, sql_lora_files)
    adapter_tokenizer = get_lora_tokenizer(adapter_request)
    assert adapter_tokenizer.get_added_vocab()

    # tmpdir (presumably holding no tokenizer files): the result is falsy.
    tmpdir_request = LoRARequest("1", 1, str(tmpdir))
    assert not get_lora_tokenizer(tmpdir_request)

View File

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import vllm
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "hmellor/Ilama-3.2-1B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM singer",
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501
"SELECT DISTINCT Country FROM singer WHERE Age > 20",
]
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    """Generate SQL for three fixed questions and return the stripped texts.

    Args:
        llm: Engine used for generation.
        lora_path: Path of the LoRA adapter files.
        lora_id: Adapter ID; a falsy value (0) sends no LoRA request.

    Returns:
        One stripped generated text per prompt, in prompt order.
    """
    questions = (
        "How many singers do we have?",
        "What is the average, minimum, and maximum age of all singers from France?",  # noqa: E501
        "What are all distinct countries where singers above age 20 are from?",
    )
    prompts = [PROMPT_TEMPLATE.format(query=question) for question in questions]

    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    adapter = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
    outputs = llm.generate(prompts, sampling_params, lora_request=adapter)

    # Print the outputs.
    generated_texts: list[str] = []
    for result in outputs:
        text = result.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Prompt: {result.prompt!r}, Generated text: {text!r}")
    return generated_texts
def test_ilama_lora(ilama_lora_files):
    """Check exact LoRA outputs under two adapter IDs, default parallelism."""
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "max_lora_rank": 16,
        "trust_remote_code": True,
        "enable_chunked_prefill": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    # The same adapter files must give the same answers under either ID.
    for adapter_id in (1, 2):
        generated = do_sample(llm, ilama_lora_files, lora_id=adapter_id)
        for idx, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert generated[idx] == expected
@pytest.mark.skipif(
    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4(ilama_lora_files):
    """Check exact LoRA outputs with tensor_parallel_size=4, unsharded LoRAs."""
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "max_lora_rank": 16,
        "tensor_parallel_size": 4,
        "trust_remote_code": True,
        "fully_sharded_loras": False,
        "enable_chunked_prefill": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    # The same adapter files must give the same answers under either ID.
    for adapter_id in (1, 2):
        generated = do_sample(llm, ilama_lora_files, lora_id=adapter_id)
        for idx, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert generated[idx] == expected
@pytest.mark.skipif(
    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
    """Check exact LoRA outputs with tensor_parallel_size=4, sharded LoRAs."""
    engine_kwargs = {
        "max_model_len": 1024,
        "enable_lora": True,
        "max_loras": 4,
        "max_lora_rank": 16,
        "tensor_parallel_size": 4,
        "trust_remote_code": True,
        "fully_sharded_loras": True,
        "enable_chunked_prefill": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    # The same adapter files must give the same answers under either ID.
    for adapter_id in (1, 2):
        generated = do_sample(llm, ilama_lora_files, lora_id=adapter_id)
        for idx, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert generated[idx] == expected

View File

@@ -1,58 +1,141 @@
from collections import OrderedDict
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections import OrderedDict
from typing import NamedTuple
from unittest.mock import patch
import pytest
from huggingface_hub.utils import HfHubHTTPError
from torch import nn
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
from vllm.utils import LRUCache
from vllm.lora.utils import (
get_adapter_absolute_path,
parse_fine_tuned_lora_name,
replace_submodule,
)
from vllm.model_executor.models.utils import WeightsMapper
def test_parse_fine_tuned_lora_name():
fixture = {
("base_model.model.lm_head.lora_A.weight", "lm_head", True),
("base_model.model.lm_head.lora_B.weight", "lm_head", False),
(
class LoRANameParserTestConfig(NamedTuple):
    """One test case for ``parse_fine_tuned_lora_name``."""

    # Weight name handed to the parser.
    name: str
    # Module name the parser is expected to return.
    module_name: str
    # Expected second element of the parser's result.
    is_lora_a: bool
    # Optional mapper passed to the parser as its second argument.
    weights_mapper: WeightsMapper | None = None
def test_parse_fine_tuned_lora_name_valid():
fixture = [
LoRANameParserTestConfig(
"base_model.model.lm_head.lora_A.weight", "lm_head", True, False
),
LoRANameParserTestConfig(
"base_model.model.lm_head.lora_B.weight", "lm_head", False, False
),
LoRANameParserTestConfig(
"base_model.model.model.embed_tokens.lora_embedding_A",
"model.embed_tokens",
True,
),
(
LoRANameParserTestConfig(
"base_model.model.model.embed_tokens.lora_embedding_B",
"model.embed_tokens",
False,
),
(
LoRANameParserTestConfig(
"base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
"model.layers.9.mlp.down_proj",
True,
),
(
LoRANameParserTestConfig(
"base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
"model.layers.9.mlp.down_proj",
False,
),
LoRANameParserTestConfig(
"language_model.layers.9.mlp.down_proj.lora_A.weight",
"language_model.layers.9.mlp.down_proj",
True,
),
LoRANameParserTestConfig(
"language_model.layers.9.mlp.down_proj.lora_B.weight",
"language_model.layers.9.mlp.down_proj",
False,
),
# Test with WeightsMapper
LoRANameParserTestConfig(
"base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
"language_model.model.layers.9.mlp.down_proj",
True,
weights_mapper=WeightsMapper(
orig_to_new_prefix={"model.": "language_model.model."}
),
),
LoRANameParserTestConfig(
"base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
"language_model.model.layers.9.mlp.down_proj",
False,
weights_mapper=WeightsMapper(
orig_to_new_prefix={"model.": "language_model.model."}
),
),
LoRANameParserTestConfig(
"model.layers.9.mlp.down_proj.lora_A.weight",
"language_model.model.layers.9.mlp.down_proj",
True,
weights_mapper=WeightsMapper(
orig_to_new_prefix={"model.": "language_model.model."}
),
),
LoRANameParserTestConfig(
"model.layers.9.mlp.down_proj.lora_B.weight",
"language_model.model.layers.9.mlp.down_proj",
False,
weights_mapper=WeightsMapper(
orig_to_new_prefix={"model.": "language_model.model."}
),
),
]
for name, module_name, is_lora_a, weights_mapper in fixture:
assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(
name, weights_mapper
)
def test_parse_fine_tuned_lora_name_invalid():
fixture = {
"base_model.weight",
"base_model.model.weight",
}
for name, module_name, is_lora_a in fixture:
assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name)
for name in fixture:
with pytest.raises(ValueError, match="unsupported LoRA weight"):
parse_fine_tuned_lora_name(name)
def test_replace_submodule():
model = nn.Sequential(
OrderedDict([
("dense1", nn.Linear(764, 100)),
("act1", nn.ReLU()),
("dense2", nn.Linear(100, 50)),
(
"seq1",
nn.Sequential(
OrderedDict([
("dense1", nn.Linear(100, 10)),
("dense2", nn.Linear(10, 50)),
])),
),
("act2", nn.ReLU()),
("output", nn.Linear(50, 10)),
("outact", nn.Sigmoid()),
]))
OrderedDict(
[
("dense1", nn.Linear(764, 100)),
("act1", nn.ReLU()),
("dense2", nn.Linear(100, 50)),
(
"seq1",
nn.Sequential(
OrderedDict(
[
("dense1", nn.Linear(100, 10)),
("dense2", nn.Linear(10, 50)),
]
)
),
),
("act2", nn.ReLU()),
("output", nn.Linear(50, 10)),
("outact", nn.Sigmoid()),
]
)
)
sigmoid = nn.Sigmoid()
@@ -64,109 +147,52 @@ def test_replace_submodule():
assert dict(model.named_modules())["seq1.dense2"] == dense2
class TestLRUCache(LRUCache):
def _on_remove(self, key, value):
if not hasattr(self, "_remove_counter"):
self._remove_counter = 0
self._remove_counter += 1
# Unit tests for get_adapter_absolute_path
@patch("os.path.isabs")
def test_get_adapter_absolute_path_absolute(mock_isabs):
    """A path reported as absolute is returned unchanged."""
    mock_isabs.return_value = True
    lora_dir = "/absolute/path/to/lora"
    resolved = get_adapter_absolute_path(lora_dir)
    assert resolved == lora_dir
def test_lru_cache():
cache = TestLRUCache(3)
@patch("os.path.expanduser")
def test_get_adapter_absolute_path_expanduser(mock_expanduser):
    """A path starting with ``~`` resolves to its expanded form."""
    expanded = "/home/user/relative/path/to/lora"
    mock_expanduser.return_value = expanded
    # Path with ~ that needs to be expanded
    resolved = get_adapter_absolute_path("~/relative/path/to/lora")
    assert resolved == expanded
cache.put(1, 1)
assert len(cache) == 1
cache.put(1, 1)
assert len(cache) == 1
@patch("os.path.exists")
@patch("os.path.abspath")
def test_get_adapter_absolute_path_local_existing(mock_abspath, mock_exist):
    """A relative path that exists locally resolves to its absolute form."""
    expected_dir = "/absolute/path/to/lora"
    mock_abspath.return_value = expected_dir
    mock_exist.return_value = True
    # Relative path that exists locally
    resolved = get_adapter_absolute_path("relative/path/to/lora")
    assert resolved == expected_dir
cache.put(2, 2)
assert len(cache) == 2
cache.put(3, 3)
assert len(cache) == 3
assert set(cache.cache) == {1, 2, 3}
@patch("huggingface_hub.snapshot_download")
@patch("os.path.exists")
def test_get_adapter_absolute_path_huggingface(mock_exist, mock_snapshot_download):
    """A non-local identifier resolves to the downloaded snapshot path."""
    snapshot_dir = "/mock/snapshot/path"
    mock_snapshot_download.return_value = snapshot_dir
    mock_exist.return_value = False
    # Hugging Face model identifier
    resolved = get_adapter_absolute_path("org/repo")
    assert resolved == snapshot_dir
cache.put(4, 4)
assert len(cache) == 3
assert set(cache.cache) == {2, 3, 4}
assert cache._remove_counter == 1
assert cache.get(2) == 2
cache.put(5, 5)
assert set(cache.cache) == {2, 4, 5}
assert cache._remove_counter == 2
assert cache.pop(5) == 5
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.pop(10)
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.get(10)
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.put(6, 6)
assert len(cache) == 3
assert set(cache.cache) == {2, 4, 6}
assert 2 in cache
assert 4 in cache
assert 6 in cache
cache.remove_oldest()
assert len(cache) == 2
assert set(cache.cache) == {2, 6}
assert cache._remove_counter == 4
cache.clear()
assert len(cache) == 0
assert cache._remove_counter == 6
cache._remove_counter = 0
cache[1] = 1
assert len(cache) == 1
cache[1] = 1
assert len(cache) == 1
cache[2] = 2
assert len(cache) == 2
cache[3] = 3
assert len(cache) == 3
assert set(cache.cache) == {1, 2, 3}
cache[4] = 4
assert len(cache) == 3
assert set(cache.cache) == {2, 3, 4}
assert cache._remove_counter == 1
assert cache[2] == 2
cache[5] = 5
assert set(cache.cache) == {2, 4, 5}
assert cache._remove_counter == 2
del cache[5]
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.pop(10)
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache[6] = 6
assert len(cache) == 3
assert set(cache.cache) == {2, 4, 6}
assert 2 in cache
assert 4 in cache
assert 6 in cache
@patch("huggingface_hub.snapshot_download")
@patch("os.path.exists")
def test_get_adapter_absolute_path_huggingface_error(
    mock_exist, mock_snapshot_download
):
    """When the download raises, the original identifier is returned."""
    download_error = HfHubHTTPError("failed to query model info")
    mock_snapshot_download.side_effect = download_error
    mock_exist.return_value = False
    # Hugging Face model identifier with download error
    repo_id = "org/repo"
    resolved = get_adapter_absolute_path(repo_id)
    assert resolved == repo_id

View File

@@ -1,69 +1,105 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random
import tempfile
from unittest.mock import patch
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, ParallelConfig, SchedulerConfig)
from vllm.lora.models import LoRAMapping
from vllm.config import (
CacheConfig,
DeviceConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
)
from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.model_manager import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.worker.worker import Worker
from vllm.v1.worker.gpu_worker import Worker
MODEL_PATH = "Qwen/Qwen3-0.6B"
NUM_LORAS = 16
@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(qwen3_lora_files):
    """Check that a worker reports the LoRA adapters that were activated on it.

    The test builds a single-GPU ``Worker`` around ``MODEL_PATH``, then:

    1. activates no adapters and expects ``list_loras()`` to be empty;
    2. activates ``NUM_LORAS`` adapters and expects exactly their ids;
    3. repeatedly activates a random subset and expects every id of that
       subset to be reported.

    Args:
        qwen3_lora_files: Fixture value passed to ``LoRARequest`` as the
            adapter location.
    """

    def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
        # Only adapter residency is asserted below, so an empty mapping is used.
        lora_mapping = LoRAMapping([], [])
        worker.model_runner.lora_manager.set_active_adapters(
            lora_requests, lora_mapping
        )

    model_config = ModelConfig(
        MODEL_PATH,
        seed=0,
        dtype="float16",
        max_model_len=127,
        enforce_eager=True,
    )

    vllm_config = VllmConfig(
        model_config=model_config,
        load_config=LoadConfig(
            download_dir=None,
            # presumably avoids loading real checkpoint weights — TODO confirm
            load_format="dummy",
        ),
        parallel_config=ParallelConfig(
            pipeline_parallel_size=1,
            tensor_parallel_size=1,
            data_parallel_size=1,
        ),
        scheduler_config=SchedulerConfig(
            max_model_len=model_config.max_model_len,
            is_encoder_decoder=model_config.is_encoder_decoder,
            runner_type="generate",
            max_num_batched_tokens=32,
            max_num_seqs=32,
            max_num_partial_prefills=32,
        ),
        device_config=DeviceConfig("cuda"),
        cache_config=CacheConfig(
            block_size=16,
            swap_space=0,
            cache_dtype="auto",
        ),
        lora_config=LoRAConfig(
            max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS
        ),
    )
    worker = Worker(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,
        distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
    )

    worker.init_device()
    worker.load_model()

    # 1. Nothing activated yet.
    set_active_loras(worker, [])
    assert worker.list_loras() == set()

    # 2. Activate every adapter once.
    lora_requests = [
        LoRARequest(str(i + 1), i + 1, qwen3_lora_files) for i in range(NUM_LORAS)
    ]

    set_active_loras(worker, lora_requests)
    assert worker.list_loras() == {
        lora_request.lora_int_id for lora_request in lora_requests
    }

    # 3. Activate random subsets (seeded per iteration for reproducibility).
    for i in range(NUM_LORAS):
        random.seed(i)
        iter_lora_requests = random.choices(
            lora_requests, k=random.randint(1, NUM_LORAS)
        )
        random.shuffle(iter_lora_requests)
        # NOTE: when the drawn value is 0 the slice is `[:-0]`, i.e. empty, so
        # some iterations activate no adapters at all.
        iter_lora_requests = iter_lora_requests[: -random.randint(0, NUM_LORAS)]
        # Activate the subset drawn for this iteration. Passing the full
        # `lora_requests` list here would make the random draw irrelevant.
        set_active_loras(worker, iter_lora_requests)
        assert worker.list_loras().issuperset(
            {lora_request.lora_int_id for lora_request in iter_lora_requests}
        )

View File

@@ -1,60 +1,64 @@
from typing import List, Optional
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os
from dataclasses import dataclass
import torch
from safetensors.torch import save_file
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
class DummyLoRAManager:
def __init__(self):
def __init__(self, device: torch.device = "cuda:0"):
super().__init__()
self._loras = {}
self._loras: dict[str, LoRALayerWeights] = {}
self._device = device
def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
    """Register `lora` under `module_name`, replacing any existing entry."""
    self._loras[module_name] = lora
def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
return self._loras.get(module_name, None)
def get_module_lora(self, module_name: str) -> LoRALayerWeights:
    """Return the LoRA registered under `module_name`.

    Raises:
        KeyError: If nothing was registered under that name.
    """
    return self._loras[module_name]
def init_random_lora(self,
module_name: str,
weight: torch.Tensor,
rank: int = 8,
generate_embeddings_tensor: int = 0):
def init_random_lora(
    self,
    module_name: str,
    weight: torch.Tensor,
    rank: int = 8,
):
    """Create, register and return a random LoRA sized after `weight`.

    Args:
        module_name: Key under which the LoRA is registered.
        weight: Tensor whose first two dimensions and dtype size the LoRA;
            assumed to be laid out as [out_features, in_features] — TODO
            confirm against callers.
        rank: LoRA rank.

    Returns:
        The newly created ``LoRALayerWeights``.
    """
    out_features = weight.shape[0]
    in_features = weight.shape[1]
    tensor_kwargs = {"dtype": weight.dtype, "device": self._device}

    # lora_a is drawn before lora_b to keep the RNG consumption order stable.
    lora_a = torch.rand([rank, in_features], **tensor_kwargs)
    lora_b = torch.rand([out_features, rank], **tensor_kwargs)

    lora = LoRALayerWeights(
        module_name,
        rank=rank,
        lora_alpha=1,
        lora_a=lora_a,
        lora_b=lora_b,
    )
    self.set_module_lora(module_name, lora)
    return lora
def init_lora(self,
module_name: str,
input_dim: int,
output_dim: int,
rank=8,
noop=False,
embeddings_tensor=None):
def init_lora(
    self,
    module_name: str,
    input_dim: int,
    output_dim: int,
    rank=8,
    noop=False,
    embeddings_tensor=None,
):
    """Create a random LoRA for a layer of the given size and register it.

    Args:
        module_name: Key under which the LoRA is registered.
        input_dim: Input feature size of the target layer.
        output_dim: Output feature size of the target layer.
        rank: LoRA rank.
        noop: Not used by the code visible here.
        embeddings_tensor: Forwarded unchanged to ``LoRALayerWeights``.
    """
    lora = LoRALayerWeights(
        module_name,
        rank=rank,
        lora_alpha=1,
        lora_a=torch.rand([rank, input_dim], device="cuda"),
        # lora_b takes the rank-sized output of lora_a back to the output
        # features, so its shape is [output_dim, rank] (the same layout
        # init_random_lora uses), not [output_dim, input_dim].
        lora_b=torch.rand([output_dim, rank], device="cuda"),
        embeddings_tensor=embeddings_tensor,
    )
    self.set_module_lora(module_name, lora)
    # NOTE(review): the rest of this method lies past the visible diff hunk;
    # callers consume a return value, so it presumably ends with `return lora`.
@@ -67,12 +71,12 @@ class DummyLoRAManager:
self,
module_name: str,
input_dim: int,
output_dims: List[int],
noop_lora_index: List[int] = None,
rank=8,
output_dims: list[int],
noop_lora_index: list[int] | None = None,
rank: int = 8,
):
base_loras = []
noop_lora_index = set(noop_lora_index or [])
base_loras: list[LoRALayerWeights] = []
noop_lora_index_set = set(noop_lora_index or [])
for i, out_dim in enumerate(output_dims):
base_lora = self.init_lora(
@@ -80,9 +84,324 @@ class DummyLoRAManager:
input_dim,
out_dim,
rank=rank,
noop=i in noop_lora_index,
noop=i in noop_lora_index_set,
)
base_loras.append(base_lora)
packed_lora = PackedLoRALayerWeights.pack(base_loras)
self.set_module_lora(module_name, packed_lora)
return packed_lora
def assert_close(a, b):
    """Assert that `a` and `b` are close, with a tolerance chosen by `a.dtype`.

    The same value is used for the relative and the absolute tolerance.

    Raises:
        KeyError: If `a.dtype` is not float16, bfloat16 or float32.
        AssertionError: If the tensors differ by more than the tolerance.
    """
    tolerance_by_dtype = {
        torch.float16: 6e-2,
        torch.bfloat16: 6e-2,
        torch.float32: 1e-2,
    }
    tolerance = tolerance_by_dtype[a.dtype]
    torch.testing.assert_close(a, b, rtol=tolerance, atol=tolerance)
@dataclass
class PunicaTensors:
inputs_tensor: torch.Tensor
lora_weights: torch.Tensor | list[torch.Tensor]
our_out_tensor: torch.Tensor
ref_out_tensor: torch.Tensor
b_seq_start_loc: torch.Tensor
prompt_lora_mapping: torch.Tensor
seq_len_tensor: torch.Tensor
token_lora_mapping: torch.Tensor
def meta(self) -> tuple[int, int]:
"""
Infer max_seq_length and token_nums from the tensors
and return them.
"""
max_seq_length = self.seq_len_tensor.max()
token_nums = self.seq_len_tensor.sum().item()
if isinstance(max_seq_length, tuple):
max_seq_length = max_seq_length[0].item()
else:
max_seq_length = max_seq_length.item()
return max_seq_length, token_nums
def generate_data(
    batches,
    hidden_size,
    lora_nums,
    max_rank,
    seq_length,
    dtype,
    op_type,
    device,
) -> PunicaTensors:
    """Build random tensors for a single-slice shrink or expand test.

    Args:
        batches: Number of sequences.
        hidden_size: Hidden dimension of the layer.
        lora_nums: Number of LoRA weight sets to create.
        max_rank: LoRA rank.
        seq_length: Token count of every sequence.
        dtype: dtype of inputs and weights.
        op_type: ``"shrink"`` selects the shrink layout; any other value
            selects the expand layout.
        device: Device the tensors are moved to after creation.

    Returns:
        A ``PunicaTensors`` holding inputs, weights, output buffers and the
        sequence / LoRA index tensors.
    """
    # randint's upper bound is exclusive, so every entry equals seq_length.
    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
    # Exclusive prefix sum of the lengths: start offset of each sequence.
    preceding_lengths = [0] + seq_len_tensor[:-1].tolist()
    b_seq_start_loc = torch.cumsum(
        torch.tensor(preceding_lengths, dtype=torch.long), dim=0
    ).to(device)
    total_tokens = seq_len_tensor.sum()

    shrinking = op_type == "shrink"
    if shrinking:
        in_dim, out_dim = hidden_size, max_rank
    else:
        in_dim, out_dim = max_rank, hidden_size

    # Inputs are drawn before weights in both modes.
    inputs_tensor = torch.rand((total_tokens, in_dim), dtype=dtype).to(device)
    # Weights are stored as (lora, out_dim, in_dim), i.e. col-major.
    lora_weights = torch.rand((lora_nums, out_dim, in_dim), dtype=dtype).to(device)

    if shrinking:
        # The shrink op accumulates with atomic_add, so outputs start at zero.
        ref_out_tensor = torch.zeros(
            (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device
        )
        # NOTE: the shrink kernel uses torch.float32 as its output type.
        our_out_tensor = torch.zeros((total_tokens, max_rank), dtype=torch.float32).to(
            device
        )
    else:
        # The expand op computes y += a @ lora_b, so the output starts random
        # and both buffers must hold the same initial values.
        ref_out_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(
            device
        )
        our_out_tensor = ref_out_tensor.clone()

    # NOTE(review): the upper bound is exclusive, so index lora_nums - 1 is
    # never drawn when lora_nums > 1 — confirm this is intended.
    index_bound = max(lora_nums - 1, 1)
    lora_indices_tensor = torch.randint(0, index_bound, (batches,)).to(device)

    # Expand the per-sequence LoRA index to one entry per token.
    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
    offset = 0
    for lora_id, n_tokens in zip(
        lora_indices_tensor.tolist(), seq_len_tensor.tolist()
    ):
        indices[offset : offset + n_tokens] = lora_id
        offset += n_tokens

    return PunicaTensors(
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    )
def generate_data_for_expand_nslices(
    batches,
    hidden_size,
    lora_nums,
    max_rank,
    seq_length,
    dtype,
    nslices,
    device,
) -> PunicaTensors:
    """Build random tensors for an expand test with `nslices` weight slices.

    Args:
        batches: Number of sequences.
        hidden_size: Output width of a single slice.
        lora_nums: Number of LoRA weight sets per slice.
        max_rank: LoRA rank (width of the inputs).
        seq_length: Token count of every sequence.
        dtype: dtype of inputs, weights and outputs.
        nslices: Number of weight slices; the output is
            ``hidden_size * nslices`` wide.
        device: Device the tensors are moved to after creation.

    Returns:
        A ``PunicaTensors`` whose ``lora_weights`` is a list of `nslices`
        tensors.
    """
    # randint's upper bound is exclusive, so every entry equals seq_length.
    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
    # Exclusive prefix sum of the lengths: start offset of each sequence.
    preceding_lengths = [0] + seq_len_tensor[:-1].tolist()
    b_seq_start_loc = torch.cumsum(
        torch.tensor(preceding_lengths, dtype=torch.long), dim=0
    ).to(device)
    total_tokens = seq_len_tensor.sum()

    inputs_tensor = torch.rand((total_tokens, max_rank), dtype=dtype).to(device)
    # One (lora, hidden, rank) weight tensor per slice, col-major.
    lora_weights_lst = [
        torch.rand((lora_nums, hidden_size, max_rank), dtype=dtype).to(device)
        for _ in range(nslices)
    ]

    # The expand op computes y += a @ lora_b, so the output starts random and
    # both buffers must hold the same initial values.
    ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), dtype=dtype).to(
        device
    )
    our_out_tensor = ref_out_tensor.clone()

    # NOTE(review): the upper bound is exclusive, so index lora_nums - 1 is
    # never drawn when lora_nums > 1 — confirm this is intended.
    index_bound = max(lora_nums - 1, 1)
    lora_indices_tensor = torch.randint(0, index_bound, (batches,))

    # Expand the per-sequence LoRA index to one entry per token.
    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
    offset = 0
    for lora_id, n_tokens in zip(
        lora_indices_tensor.tolist(), seq_len_tensor.tolist()
    ):
        indices[offset : offset + n_tokens] = lora_id
        offset += n_tokens

    # The per-sequence indices are moved only after the token map is filled.
    lora_indices_tensor = lora_indices_tensor.to(device)
    return PunicaTensors(
        inputs_tensor,
        lora_weights_lst,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    )
def generate_data_for_nslices(
    batches,
    hidden_size,
    lora_nums,
    max_rank,
    seq_length,
    nslices,
    dtype,
    op_type,
    device,
) -> PunicaTensors:
    """Build random tensors for a multi-slice shrink or expand test.

    Args:
        batches: Number of sequences.
        hidden_size: Hidden dimension of a single slice.
        lora_nums: Number of LoRA weight sets per slice.
        max_rank: LoRA rank.
        seq_length: Token count of every sequence.
        nslices: Number of weight slices.
        dtype: dtype of inputs and weights.
        op_type: ``"shrink"`` selects the shrink layout; any other value
            selects the expand layout.
        device: Device the tensors are moved to after creation.

    Returns:
        A ``PunicaTensors`` whose ``lora_weights`` is a list of `nslices`
        tensors and whose two output buffers start out identical.
    """
    # randint's upper bound is exclusive, so every entry equals seq_length.
    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
    # Exclusive prefix sum of the lengths: start offset of each sequence.
    preceding_lengths = [0] + seq_len_tensor[:-1].tolist()
    b_seq_start_loc = torch.cumsum(
        torch.tensor(preceding_lengths, dtype=torch.long), dim=0
    ).to(device)
    total_tokens = seq_len_tensor.sum()

    shrinking = op_type == "shrink"
    if shrinking:
        input_shape = (total_tokens, hidden_size)
        weight_shape = (lora_nums, max_rank, hidden_size)  # col-major
    else:
        input_shape = (nslices, total_tokens, max_rank)
        weight_shape = (lora_nums, hidden_size, max_rank)  # col-major

    # Inputs are drawn first, then one weight tensor per slice.
    inputs_tensor = torch.rand(input_shape, dtype=dtype).to(device)
    lora_weights_lst = [
        torch.rand(weight_shape, dtype=dtype).to(device) for _ in range(nslices)
    ]

    if shrinking:
        # NOTE: the shrink kernel uses torch.float32 as its output type.
        # The shrink op accumulates with atomic_add, so outputs start at zero.
        our_out_tensor = torch.zeros(
            (nslices, total_tokens, max_rank),
            dtype=torch.float32,
        ).to(device)
    else:
        # The expand op computes y += a @ lora_b, so the output starts random.
        our_out_tensor = torch.rand(
            (total_tokens, hidden_size * nslices), dtype=dtype
        ).to(device)

    # Both buffers must hold the same initial values.
    ref_out_tensor = our_out_tensor.clone()

    # NOTE(review): the upper bound is exclusive, so index lora_nums - 1 is
    # never drawn when lora_nums > 1 — confirm this is intended.
    index_bound = max(lora_nums - 1, 1)
    lora_indices_tensor = torch.randint(0, index_bound, (batches,))

    # Expand the per-sequence LoRA index to one entry per token.
    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
    offset = 0
    for lora_id, n_tokens in zip(
        lora_indices_tensor.tolist(), seq_len_tensor.tolist()
    ):
        indices[offset : offset + n_tokens] = lora_id
        offset += n_tokens

    # The per-sequence indices are moved only after the token map is filled.
    lora_indices_tensor = lora_indices_tensor.to(device)
    return PunicaTensors(
        inputs_tensor,
        lora_weights_lst,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    )
def create_peft_lora(
    model: torch.nn.Module,
    save_dir: str,
    target_modules: list[str],
    rank: int = 8,
    alpha: int = 16,
    dropout: float = 0.1,
    lora_dtype: torch.dtype = torch.float16,
) -> dict[str, torch.Tensor]:
    """Write a PEFT-style LoRA adapter for `model` into `save_dir`.

    For each entry of `target_modules` a ``lora_A`` (Kaiming-uniform
    initialised) and an all-zero ``lora_B`` tensor are created. The tensors
    are saved as ``adapter_model.safetensors`` next to an
    ``adapter_config.json`` describing the adapter.

    Args:
        model: Model whose submodules determine the LoRA dimensions.
        save_dir: Directory the two files are written to.
        target_modules: Dotted attribute paths of the modules to adapt.
        rank: LoRA rank, stored as ``r`` in the config.
        alpha: Stored as ``lora_alpha`` in the config.
        dropout: Stored as ``lora_dropout`` in the config.
        lora_dtype: dtype of the generated tensors.

    Returns:
        The mapping from PEFT parameter name to tensor that was saved.

    Raises:
        ValueError: If a target module exposes neither
            ``input_size``/``output_size`` nor
            ``embedding_dim``/``num_embeddings``.
    """
    adapter_config = {
        "peft_type": "LORA",
        "auto_mapping": None,
        "base_model_name_or_path": "dummy_model",
        "revision": None,
        "task_type": "CAUSAL_LM",
        "inference_mode": False,
        "r": rank,
        "lora_alpha": alpha,
        "lora_dropout": dropout,
        "fan_in_fan_out": False,
        "bias": "none",
        "modules_to_save": None,
        "init_lora_weights": True,
        "layers_to_transform": None,
        "layers_pattern": None,
        "target_modules": target_modules,
        "exclude_modules": None,
        "use_rslora": False,
        "use_dora": False,
        "loftq_config": None,
    }

    lora_weights = {}
    for module_name in target_modules:
        # Walk the dotted path down to the target submodule.
        target = model
        for part in module_name.split("."):
            target = getattr(target, part)

        if hasattr(target, "input_size") and hasattr(target, "output_size"):
            in_features, out_features = target.input_size, target.output_size
        elif hasattr(target, "embedding_dim") and hasattr(target, "num_embeddings"):
            # ParallelLMHead
            in_features, out_features = target.embedding_dim, target.num_embeddings
        else:
            raise ValueError(f"Unable to determine dimensions for module {module_name}")

        # kaiming_uniform_ initialises in place and returns the same tensor.
        lora_A = torch.nn.init.kaiming_uniform_(
            torch.randn(rank, in_features, dtype=lora_dtype), a=5**0.5
        )
        lora_B = torch.zeros(out_features, rank, dtype=lora_dtype)

        # PEFT style
        lora_weights[f"base_model.model.{module_name}.lora_A.weight"] = lora_A
        lora_weights[f"base_model.model.{module_name}.lora_B.weight"] = lora_B

    with open(
        os.path.join(save_dir, "adapter_config.json"), "w", encoding="utf-8"
    ) as config_file:
        json.dump(adapter_config, config_file, indent=2, ensure_ascii=False)

    save_file(lora_weights, os.path.join(save_dir, "adapter_model.safetensors"))

    return lora_weights