Support starting up the LoRA server without initial adapters (#8019)
This commit is contained in:
@@ -64,8 +64,9 @@ class TestCase:
|
||||
base: str
|
||||
max_loras_per_batch: int
|
||||
all_adapters: List[str]
|
||||
-initial_adapters: List[str]
|
||||
op_sequence: List[Operation]
|
||||
+initial_adapters: Optional[List[str]] = None
|
||||
+enable_lora: Optional[bool] = None
|
||||
max_lora_rank: Optional[int] = None
|
||||
lora_target_modules: Optional[List] = None
|
||||
max_new_tokens: int = 32
|
||||
@@ -171,6 +172,64 @@ BASIC_TESTS = [
|
||||
),
|
||||
],
|
||||
),
|
||||
TestCase(
|
||||
description="dynamic lora update without initial lora_paths",
|
||||
base="meta-llama/Llama-3.1-8B-Instruct",
|
||||
enable_lora=True,
|
||||
max_lora_rank=256,
|
||||
lora_target_modules=["all"],
|
||||
max_loras_per_batch=4,
|
||||
all_adapters=[
|
||||
"philschmid/code-llama-3-1-8b-text-to-sql-lora",
|
||||
"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
|
||||
"pbevan11/llama-3.1-8b-ocr-correction",
|
||||
],
|
||||
op_sequence=[
|
||||
Operation(
|
||||
type=OperationType.LOAD,
|
||||
data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
|
||||
),
|
||||
Operation(
|
||||
type=OperationType.LOAD,
|
||||
data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
|
||||
),
|
||||
Operation(
|
||||
type=OperationType.LOAD,
|
||||
data="pbevan11/llama-3.1-8b-ocr-correction",
|
||||
),
|
||||
Operation(
|
||||
type=OperationType.FORWARD,
|
||||
data=create_batch_data(
|
||||
[
|
||||
"philschmid/code-llama-3-1-8b-text-to-sql-lora",
|
||||
"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
|
||||
"pbevan11/llama-3.1-8b-ocr-correction",
|
||||
None,
|
||||
]
|
||||
),
|
||||
),
|
||||
Operation(
|
||||
type=OperationType.UNLOAD,
|
||||
data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
|
||||
),
|
||||
Operation(
|
||||
type=OperationType.FORWARD,
|
||||
data=create_batch_data("philschmid/code-llama-3-1-8b-text-to-sql-lora"),
|
||||
expected_error="not loaded",
|
||||
),
|
||||
Operation(
|
||||
type=OperationType.FORWARD,
|
||||
data=create_batch_data(
|
||||
[
|
||||
None,
|
||||
"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
|
||||
"pbevan11/llama-3.1-8b-ocr-correction",
|
||||
None,
|
||||
]
|
||||
),
|
||||
),
|
||||
],
|
||||
),
|
||||
TestCase(
|
||||
description="dynamic lora update with evictions",
|
||||
base="meta-llama/Llama-3.1-8B-Instruct",
|
||||
@@ -371,7 +430,7 @@ TARGET_MODULE_TESTS = [
|
||||
Operation(
|
||||
type=OperationType.LOAD,
|
||||
data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
|
||||
-expected_error="updating LoRA shapes",
|
||||
+expected_error="incompatible",
|
||||
),
|
||||
Operation(
|
||||
type=OperationType.FORWARD,
|
||||
@@ -431,7 +490,7 @@ MAX_LORA_RANK_TESTS = [
|
||||
Operation(
|
||||
type=OperationType.LOAD,
|
||||
data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
|
||||
-expected_error="updating LoRA shapes",
|
||||
+expected_error="incompatible",
|
||||
),
|
||||
Operation(
|
||||
type=OperationType.FORWARD,
|
||||
@@ -470,7 +529,7 @@ MAX_LORA_RANK_TESTS = [
|
||||
Operation(
|
||||
type=OperationType.LOAD,
|
||||
data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
|
||||
-expected_error="updating LoRA shapes",
|
||||
+expected_error="incompatible",
|
||||
),
|
||||
Operation(
|
||||
type=OperationType.FORWARD,
|
||||
@@ -521,6 +580,7 @@ class LoRAUpdateTestSessionBase:
|
||||
lora_paths: list[str],
|
||||
max_loras_per_batch: int,
|
||||
max_lora_rank: Optional[int],
|
||||
enable_lora: Optional[bool] = None,
|
||||
lora_target_modules: Optional[List[str]] = None,
|
||||
lora_backend: str = "triton",
|
||||
disable_cuda_graph: bool = False,
|
||||
@@ -535,8 +595,9 @@ class LoRAUpdateTestSessionBase:
|
||||
self.lora_backend = lora_backend
|
||||
self.disable_cuda_graph = disable_cuda_graph
|
||||
self.cuda_graph_max_bs = cuda_graph_max_bs
|
||||
+self.enable_lora = enable_lora
|
||||
|
||||
-self.expected_adapters = set(lora_paths)
|
||||
+self.expected_adapters = set(lora_paths or [])
|
||||
self.handle = None # Will be set in __enter__
|
||||
|
||||
def __enter__(self):
|
||||
@@ -596,6 +657,7 @@ class LoRAUpdateEngineTestSession(LoRAUpdateTestSessionBase):
|
||||
disable_cuda_graph=self.disable_cuda_graph,
|
||||
cuda_graph_max_bs=self.cuda_graph_max_bs,
|
||||
disable_radix_cache=True,
|
||||
enable_lora=self.enable_lora,
|
||||
)
|
||||
self.handle.__enter__()
|
||||
return self
|
||||
@@ -690,8 +752,6 @@ class LoRAUpdateServerTestSession(LoRAUpdateTestSessionBase):
|
||||
other_args = [
|
||||
"--cuda-graph-max-bs",
|
||||
str(self.cuda_graph_max_bs),
|
||||
-"--lora-paths",
|
||||
-*self.lora_paths,
|
||||
"--max-loras-per-batch",
|
||||
str(self.max_loras_per_batch),
|
||||
"--lora-backend",
|
||||
@@ -704,6 +764,10 @@ class LoRAUpdateServerTestSession(LoRAUpdateTestSessionBase):
|
||||
"--mem-fraction-static",
|
||||
str(MEM_FRACTION_STATIC),
|
||||
]
|
||||
if self.enable_lora:
|
||||
other_args.append("--enable-lora")
|
||||
if self.lora_paths:
|
||||
other_args.extend(["--lora-paths"] + self.lora_paths)
|
||||
if self.disable_cuda_graph:
|
||||
other_args.append("--disable-cuda-graph")
|
||||
if self.max_lora_rank is not None:
|
||||
@@ -836,6 +900,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
|
||||
initial_adapters: List[str],
|
||||
max_loras_per_batch: int,
|
||||
op_sequence: List[Operation],
|
||||
enable_lora: Optional[bool] = None,
|
||||
max_lora_rank: Optional[int] = None,
|
||||
lora_target_modules: Optional[List[str]] = None,
|
||||
max_new_tokens: int = 32,
|
||||
@@ -854,6 +919,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
|
||||
max_loras_per_batch=max_loras_per_batch,
|
||||
max_lora_rank=max_lora_rank,
|
||||
lora_target_modules=lora_target_modules,
|
||||
enable_lora=enable_lora,
|
||||
) as session:
|
||||
for op in op_sequence:
|
||||
op_type = op.type
|
||||
@@ -903,6 +969,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
|
||||
dynamic_output = self._run_operation_sequence(
|
||||
mode=mode,
|
||||
initial_adapters=test_case.initial_adapters,
|
||||
enable_lora=test_case.enable_lora,
|
||||
base=test_case.base,
|
||||
max_loras_per_batch=test_case.max_loras_per_batch,
|
||||
op_sequence=test_case.op_sequence,
|
||||
@@ -923,6 +990,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
|
||||
static_output = self._run_operation_sequence(
|
||||
mode=mode,
|
||||
initial_adapters=test_case.all_adapters,
|
||||
enable_lora=test_case.enable_lora,
|
||||
base=test_case.base,
|
||||
max_loras_per_batch=test_case.max_loras_per_batch,
|
||||
op_sequence=forward_ops,
|
||||
|
||||
@@ -18,7 +18,7 @@ suites = {
|
||||
TestFile("models/lora/test_lora_backend.py", 99),
|
||||
TestFile("models/lora/test_multi_lora_backend.py", 60),
|
||||
TestFile("models/lora/test_lora_cuda_graph.py", 250),
|
||||
-TestFile("models/lora/test_lora_update.py", 700),
|
||||
+TestFile("models/lora/test_lora_update.py", 800),
|
||||
TestFile("models/test_embedding_models.py", 73),
|
||||
# TestFile("models/test_clip_models.py", 52),
|
||||
TestFile("models/test_encoder_embedding_models.py", 100),
|
||||
|
||||
Reference in New Issue
Block a user