Support starting up the LoRA server without initial adapters (#8019)

2025-07-19 15:38:09 -07:00
parent 60468da4e2
commit 4e3defe5a7
12 changed files with 290 additions and 195 deletions
--- a/test/srt/models/lora/test_lora_update.py
+++ b/test/srt/models/lora/test_lora_update.py
@@ -64,8 +64,9 @@ class TestCase:
    base: str
    max_loras_per_batch: int
    all_adapters: List[str]
-    initial_adapters: List[str]
    op_sequence: List[Operation]
+    initial_adapters: Optional[List[str]] = None
+    enable_lora: Optional[bool] = None
    max_lora_rank: Optional[int] = None
    lora_target_modules: Optional[List] = None
    max_new_tokens: int = 32
@@ -171,6 +172,64 @@ BASIC_TESTS = [
            ),
        ],
    ),
+    TestCase(
+        description="dynamic lora update without initial lora_paths",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        enable_lora=True,
+        max_lora_rank=256,
+        lora_target_modules=["all"],
+        max_loras_per_batch=4,
+        all_adapters=[
+            "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            "pbevan11/llama-3.1-8b-ocr-correction",
+        ],
+        op_sequence=[
+            Operation(
+                type=OperationType.LOAD,
+                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                        None,
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("philschmid/code-llama-3-1-8b-text-to-sql-lora"),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        None,
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                        None,
+                    ]
+                ),
+            ),
+        ],
+    ),
    TestCase(
        description="dynamic lora update with evictions",
        base="meta-llama/Llama-3.1-8B-Instruct",
@@ -371,7 +430,7 @@ TARGET_MODULE_TESTS = [
            Operation(
                type=OperationType.LOAD,
                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
-                expected_error="updating LoRA shapes",
+                expected_error="incompatible",
            ),
            Operation(
                type=OperationType.FORWARD,
@@ -431,7 +490,7 @@ MAX_LORA_RANK_TESTS = [
            Operation(
                type=OperationType.LOAD,
                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
-                expected_error="updating LoRA shapes",
+                expected_error="incompatible",
            ),
            Operation(
                type=OperationType.FORWARD,
@@ -470,7 +529,7 @@ MAX_LORA_RANK_TESTS = [
            Operation(
                type=OperationType.LOAD,
                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
-                expected_error="updating LoRA shapes",
+                expected_error="incompatible",
            ),
            Operation(
                type=OperationType.FORWARD,
@@ -521,6 +580,7 @@ class LoRAUpdateTestSessionBase:
        lora_paths: list[str],
        max_loras_per_batch: int,
        max_lora_rank: Optional[int],
+        enable_lora: Optional[bool] = None,
        lora_target_modules: Optional[List[str]] = None,
        lora_backend: str = "triton",
        disable_cuda_graph: bool = False,
@@ -535,8 +595,9 @@ class LoRAUpdateTestSessionBase:
        self.lora_backend = lora_backend
        self.disable_cuda_graph = disable_cuda_graph
        self.cuda_graph_max_bs = cuda_graph_max_bs
+        self.enable_lora = enable_lora

-        self.expected_adapters = set(lora_paths)
+        self.expected_adapters = set(lora_paths or [])
        self.handle = None  # Will be set in __enter__

    def __enter__(self):
@@ -596,6 +657,7 @@ class LoRAUpdateEngineTestSession(LoRAUpdateTestSessionBase):
            disable_cuda_graph=self.disable_cuda_graph,
            cuda_graph_max_bs=self.cuda_graph_max_bs,
            disable_radix_cache=True,
+            enable_lora=self.enable_lora,
        )
        self.handle.__enter__()
        return self
@@ -690,8 +752,6 @@ class LoRAUpdateServerTestSession(LoRAUpdateTestSessionBase):
        other_args = [
            "--cuda-graph-max-bs",
            str(self.cuda_graph_max_bs),
-            "--lora-paths",
-            *self.lora_paths,
            "--max-loras-per-batch",
            str(self.max_loras_per_batch),
            "--lora-backend",
@@ -704,6 +764,10 @@ class LoRAUpdateServerTestSession(LoRAUpdateTestSessionBase):
            "--mem-fraction-static",
            str(MEM_FRACTION_STATIC),
        ]
+        if self.enable_lora:
+            other_args.append("--enable-lora")
+        if self.lora_paths:
+            other_args.extend(["--lora-paths"] + self.lora_paths)
        if self.disable_cuda_graph:
            other_args.append("--disable-cuda-graph")
        if self.max_lora_rank is not None:
@@ -836,6 +900,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
        initial_adapters: List[str],
        max_loras_per_batch: int,
        op_sequence: List[Operation],
+        enable_lora: Optional[bool] = None,
        max_lora_rank: Optional[int] = None,
        lora_target_modules: Optional[List[str]] = None,
        max_new_tokens: int = 32,
@@ -854,6 +919,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
            max_loras_per_batch=max_loras_per_batch,
            max_lora_rank=max_lora_rank,
            lora_target_modules=lora_target_modules,
+            enable_lora=enable_lora,
        ) as session:
            for op in op_sequence:
                op_type = op.type
@@ -903,6 +969,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
            dynamic_output = self._run_operation_sequence(
                mode=mode,
                initial_adapters=test_case.initial_adapters,
+                enable_lora=test_case.enable_lora,
                base=test_case.base,
                max_loras_per_batch=test_case.max_loras_per_batch,
                op_sequence=test_case.op_sequence,
@@ -923,6 +990,7 @@ class TestLoRADynamicUpdate(CustomTestCase):
            static_output = self._run_operation_sequence(
                mode=mode,
                initial_adapters=test_case.all_adapters,
+                enable_lora=test_case.enable_lora,
                base=test_case.base,
                max_loras_per_batch=test_case.max_loras_per_batch,
                op_sequence=forward_ops,
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -18,7 +18,7 @@ suites = {
        TestFile("models/lora/test_lora_backend.py", 99),
        TestFile("models/lora/test_multi_lora_backend.py", 60),
        TestFile("models/lora/test_lora_cuda_graph.py", 250),
-        TestFile("models/lora/test_lora_update.py", 700),
+        TestFile("models/lora/test_lora_update.py", 800),
        TestFile("models/test_embedding_models.py", 73),
        # TestFile("models/test_clip_models.py", 52),
        TestFile("models/test_encoder_embedding_models.py", 100),