Support starting up the LoRA server without initial adapters (#8019)

This commit is contained in:
Lifu Huang
2025-07-19 15:38:09 -07:00
committed by GitHub
parent 60468da4e2
commit 4e3defe5a7
12 changed files with 290 additions and 195 deletions

View File

@@ -64,8 +64,9 @@ class TestCase:
base: str
max_loras_per_batch: int
all_adapters: List[str]
initial_adapters: List[str]
op_sequence: List[Operation]
initial_adapters: Optional[List[str]] = None
enable_lora: Optional[bool] = None
max_lora_rank: Optional[int] = None
lora_target_modules: Optional[List] = None
max_new_tokens: int = 32
@@ -171,6 +172,64 @@ BASIC_TESTS = [
),
],
),
TestCase(
description="dynamic lora update without initial lora_paths",
base="meta-llama/Llama-3.1-8B-Instruct",
enable_lora=True,
max_lora_rank=256,
lora_target_modules=["all"],
max_loras_per_batch=4,
all_adapters=[
"philschmid/code-llama-3-1-8b-text-to-sql-lora",
"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
"pbevan11/llama-3.1-8b-ocr-correction",
],
op_sequence=[
Operation(
type=OperationType.LOAD,
data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
),
Operation(
type=OperationType.LOAD,
data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
),
Operation(
type=OperationType.LOAD,
data="pbevan11/llama-3.1-8b-ocr-correction",
),
Operation(
type=OperationType.FORWARD,
data=create_batch_data(
[
"philschmid/code-llama-3-1-8b-text-to-sql-lora",
"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
"pbevan11/llama-3.1-8b-ocr-correction",
None,
]
),
),
Operation(
type=OperationType.UNLOAD,
data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
),
Operation(
type=OperationType.FORWARD,
data=create_batch_data("philschmid/code-llama-3-1-8b-text-to-sql-lora"),
expected_error="not loaded",
),
Operation(
type=OperationType.FORWARD,
data=create_batch_data(
[
None,
"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
"pbevan11/llama-3.1-8b-ocr-correction",
None,
]
),
),
],
),
TestCase(
description="dynamic lora update with evictions",
base="meta-llama/Llama-3.1-8B-Instruct",
@@ -371,7 +430,7 @@ TARGET_MODULE_TESTS = [
Operation(
type=OperationType.LOAD,
data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
expected_error="updating LoRA shapes",
expected_error="incompatible",
),
Operation(
type=OperationType.FORWARD,
@@ -431,7 +490,7 @@ MAX_LORA_RANK_TESTS = [
Operation(
type=OperationType.LOAD,
data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
expected_error="updating LoRA shapes",
expected_error="incompatible",
),
Operation(
type=OperationType.FORWARD,
@@ -470,7 +529,7 @@ MAX_LORA_RANK_TESTS = [
Operation(
type=OperationType.LOAD,
data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
expected_error="updating LoRA shapes",
expected_error="incompatible",
),
Operation(
type=OperationType.FORWARD,
@@ -521,6 +580,7 @@ class LoRAUpdateTestSessionBase:
lora_paths: list[str],
max_loras_per_batch: int,
max_lora_rank: Optional[int],
enable_lora: Optional[bool] = None,
lora_target_modules: Optional[List[str]] = None,
lora_backend: str = "triton",
disable_cuda_graph: bool = False,
@@ -535,8 +595,9 @@ class LoRAUpdateTestSessionBase:
self.lora_backend = lora_backend
self.disable_cuda_graph = disable_cuda_graph
self.cuda_graph_max_bs = cuda_graph_max_bs
self.enable_lora = enable_lora
self.expected_adapters = set(lora_paths)
self.expected_adapters = set(lora_paths or [])
self.handle = None # Will be set in __enter__
def __enter__(self):
@@ -596,6 +657,7 @@ class LoRAUpdateEngineTestSession(LoRAUpdateTestSessionBase):
disable_cuda_graph=self.disable_cuda_graph,
cuda_graph_max_bs=self.cuda_graph_max_bs,
disable_radix_cache=True,
enable_lora=self.enable_lora,
)
self.handle.__enter__()
return self
@@ -690,8 +752,6 @@ class LoRAUpdateServerTestSession(LoRAUpdateTestSessionBase):
other_args = [
"--cuda-graph-max-bs",
str(self.cuda_graph_max_bs),
"--lora-paths",
*self.lora_paths,
"--max-loras-per-batch",
str(self.max_loras_per_batch),
"--lora-backend",
@@ -704,6 +764,10 @@ class LoRAUpdateServerTestSession(LoRAUpdateTestSessionBase):
"--mem-fraction-static",
str(MEM_FRACTION_STATIC),
]
if self.enable_lora:
other_args.append("--enable-lora")
if self.lora_paths:
other_args.extend(["--lora-paths"] + self.lora_paths)
if self.disable_cuda_graph:
other_args.append("--disable-cuda-graph")
if self.max_lora_rank is not None:
@@ -836,6 +900,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
initial_adapters: List[str],
max_loras_per_batch: int,
op_sequence: List[Operation],
enable_lora: Optional[bool] = None,
max_lora_rank: Optional[int] = None,
lora_target_modules: Optional[List[str]] = None,
max_new_tokens: int = 32,
@@ -854,6 +919,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
max_loras_per_batch=max_loras_per_batch,
max_lora_rank=max_lora_rank,
lora_target_modules=lora_target_modules,
enable_lora=enable_lora,
) as session:
for op in op_sequence:
op_type = op.type
@@ -903,6 +969,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
dynamic_output = self._run_operation_sequence(
mode=mode,
initial_adapters=test_case.initial_adapters,
enable_lora=test_case.enable_lora,
base=test_case.base,
max_loras_per_batch=test_case.max_loras_per_batch,
op_sequence=test_case.op_sequence,
@@ -923,6 +990,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
static_output = self._run_operation_sequence(
mode=mode,
initial_adapters=test_case.all_adapters,
enable_lora=test_case.enable_lora,
base=test_case.base,
max_loras_per_batch=test_case.max_loras_per_batch,
op_sequence=forward_ops,

View File

@@ -18,7 +18,7 @@ suites = {
TestFile("models/lora/test_lora_backend.py", 99),
TestFile("models/lora/test_multi_lora_backend.py", 60),
TestFile("models/lora/test_lora_cuda_graph.py", 250),
TestFile("models/lora/test_lora_update.py", 700),
TestFile("models/lora/test_lora_update.py", 800),
TestFile("models/test_embedding_models.py", 73),
# TestFile("models/test_clip_models.py", 52),
TestFile("models/test_encoder_embedding_models.py", 100),