Support starting up the LoRA server without initial adapters (#8019)

2025-07-19 15:38:09 -07:00
parent 60468da4e2
commit 4e3defe5a7
12 changed files with 290 additions and 195 deletions
--- a/python/sglang/srt/lora/lora_manager.py
+++ b/python/sglang/srt/lora/lora_manager.py
@@ -186,9 +186,9 @@ class LoRAManager:
        )
        if incompatible:
            raise ValueError(
-                f"LoRA adapter {lora_name} with rank {lora_config.r} is incompatible with the current LoRA memory pool configuration."
-                "We are still working on supporting dynamically updating LoRA shapes. If you expect to use adapters of different shapes, "
-                "You can specify expected configs via --max_lora_rank and --enable_lora_modules."
+                f"LoRA adapter {lora_name} with rank {lora_config.r} is incompatible with the current LoRA memory pool configuration. "
+                "Please ensure that the LoRA adapter's rank is within the configured `--max_lora_rank` and that the target modules are "
+                "included in `--enable_lora_modules`."
            )

    def unload_lora_adapter(self, lora_name: str) -> LoRAUpdateResult:
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -574,7 +574,7 @@ class TokenizerManager:
                    "The server is not configured to enable custom logit processor. "
                    "Please set `--enable-custom-logits-processor` to enable this feature."
                )
-            if self.server_args.lora_paths and obj.lora_path:
+            if self.server_args.enable_lora and obj.lora_path:
                self._validate_lora_adapters(obj)

    def _validate_input_ids_in_vocab(
@@ -1037,6 +1037,10 @@ class TokenizerManager:
        _: Optional[fastapi.Request] = None,
    ) -> LoadLoRAAdapterReqOutput:
        self.auto_create_handle_loop()
+        if not self.server_args.enable_lora:
+            raise ValueError(
+                "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+            )

        # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
        # with dp_size > 1.
@@ -1060,6 +1064,10 @@ class TokenizerManager:
        _: Optional[fastapi.Request] = None,
    ) -> UnloadLoRAAdapterReqOutput:
        self.auto_create_handle_loop()
+        if not self.server_args.enable_lora:
+            raise ValueError(
+                "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+            )

        # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
        # with dp_size > 1.
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -264,7 +264,7 @@ class CudaGraphRunner:
        if self.enable_torch_compile:
            set_torch_compile_config()

-        if self.model_runner.server_args.lora_paths is not None:
+        if self.model_runner.server_args.enable_lora:
            self.model_runner.lora_manager.init_cuda_graph_batch_info(self.max_bs)

        # Graph inputs
@@ -510,11 +510,10 @@ class CudaGraphRunner:
                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
            )

-        if self.model_runner.server_args.lora_paths is not None:
-            # Currently, if the lora_path in `lora_paths` is None, the lora backend will use a
-            # different logic to handle lora, so we need to set `lora_paths` to a list of non-None
-            # values if lora is enabled.
-            lora_paths = [next(iter(self.model_runner.server_args.lora_paths))] * bs
+        if self.model_runner.server_args.enable_lora:
+            # It is safe to capture CUDA graph using empty LoRA path, as the LoRA kernels will always be launched whenever
+            # `--enable-lora` is set to True (and return immediately if the LoRA path is empty for perf optimization).
+            lora_paths = [None] * bs
        else:
            lora_paths = None

--- a/python/sglang/srt/model_executor/forward_batch_info.py
+++ b/python/sglang/srt/model_executor/forward_batch_info.py
@@ -418,7 +418,7 @@ class ForwardBatch:
            ret._compute_mrope_positions(model_runner, batch)

        # Init lora information
-        if model_runner.server_args.lora_paths is not None:
+        if model_runner.server_args.enable_lora:
            model_runner.lora_manager.prepare_lora_batch(ret)

        TboForwardBatchPreparer.prepare(
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -304,11 +304,7 @@ class ModelRunner:
            self.apply_torch_tp()

        # Init lora
-        # TODO (lifuhuang): when we support dynamic LoRA loading / unloading, we should add
-        # a new server arg `enable_lora` to control whether to init LoRA manager to be more
-        # explicit, as it is perfectly valid to start a server with an empty lora_paths and
-        # load LoRA adapters dynamically later.
-        if server_args.lora_paths is not None:
+        if server_args.enable_lora:
            self.init_lora_manager()

        # Init memory pool and attention backends
@@ -895,7 +891,7 @@ class ModelRunner:
            max_lora_rank=self.server_args.max_lora_rank,
            target_modules=self.server_args.lora_target_modules,
        )
-        result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths)
+        result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths or {})
        if result.success:
            logger.info(
                f"LoRA manager ready. Loaded LoRA adapters: {', '.join(result.loaded_adapters)}"
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -26,6 +26,8 @@ from typing import List, Literal, Optional, Union
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
+    LORA_TARGET_ALL_MODULES,
+    SUPPORTED_LORA_TARGET_MODULES,
    configure_ipv6,
    get_device,
    get_device_memory_capacity,
@@ -140,8 +142,9 @@ class ServerArgs:
    preferred_sampling_params: Optional[str] = None

    # LoRA
+    enable_lora: Optional[bool] = None
    max_lora_rank: Optional[int] = None
-    lora_target_modules: Optional[List[str]] = None
+    lora_target_modules: Optional[Union[set[str], List[str]]] = None
    lora_paths: Optional[Union[dict[str, str], List[str]]] = None
    max_loras_per_batch: int = 8
    lora_backend: str = "triton"
@@ -1148,6 +1151,12 @@ class ServerArgs:
        )

        # LoRA
+        parser.add_argument(
+            "--enable-lora",
+            default=ServerArgs.enable_lora,
+            action="store_true",
+            help="Enable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility.",
+        )
        parser.add_argument(
            "--max-lora-rank",
            default=ServerArgs.max_lora_rank,
@@ -1157,18 +1166,12 @@ class ServerArgs:
        parser.add_argument(
            "--lora-target-modules",
            type=str,
-            choices=[
-                "q_proj",
-                "k_proj",
-                "v_proj",
-                "o_proj",
-                "gate_proj",
-                "up_proj",
-                "down_proj",
-            ],
+            choices=SUPPORTED_LORA_TARGET_MODULES + [LORA_TARGET_ALL_MODULES],
            nargs="*",
            default=None,
-            help="The union set of all target modules where LoRA should be applied. If not specified, it will be automatically inferred from the adapters provided in --lora-paths.",
+            help="The union set of all target modules where LoRA should be applied. If not specified, "
+            "it will be automatically inferred from the adapters provided in --lora-paths. If 'all' is specified, "
+            "all supported modules will be targeted.",
        )
        parser.add_argument(
            "--lora-paths",
@@ -1816,15 +1819,46 @@ class ServerArgs:
            None,
        }, "moe_dense_tp_size only support 1 and None currently"

-        if isinstance(self.lora_paths, list):
-            lora_paths = self.lora_paths
-            self.lora_paths = {}
-            for lora_path in lora_paths:
-                if "=" in lora_path:
-                    name, path = lora_path.split("=", 1)
-                    self.lora_paths[name] = path
-                else:
-                    self.lora_paths[lora_path] = lora_path
+        self.check_lora_server_args()
+
+    def check_lora_server_args(self):
+        """Resolve `enable_lora` and normalize and validate the other LoRA arguments.
+
+        Updates `enable_lora`, `lora_paths` and `lora_target_modules` in place, and
+        raises AssertionError when an enabled LoRA setup is invalid or under-specified.
+        """
+        # Enable LoRA if any LoRA paths are provided for backward compatibility.
+        if self.lora_paths:
+            if self.enable_lora is None:
+                self.enable_lora = True
+                logger.info(
+                    "--enable-lora is set to True because --lora-paths is provided."
+                )
+            elif self.enable_lora is False:
+                logger.warning(
+                    "--enable-lora is set to False, any provided lora_paths will be ignored."
+                )
+
+        if self.enable_lora:
+            # Normalize a list of "name=path" / bare-path entries into a name -> path dict.
+            if isinstance(self.lora_paths, list):
+                self.lora_paths = dict(
+                    p.split("=", 1) if "=" in p else (p, p) for p in self.lora_paths
+                )
+
+            # Expand the "all" sentinel into the full set of supported target modules.
+            if self.lora_target_modules:
+                self.lora_target_modules = set(self.lora_target_modules)
+                if LORA_TARGET_ALL_MODULES in self.lora_target_modules:
+                    assert (
+                        len(self.lora_target_modules) == 1
+                    ), "If 'all' is specified in --lora-target-modules, it should be the only module specified."
+                    self.lora_target_modules = set(SUPPORTED_LORA_TARGET_MODULES)
+
+            # Ensure sufficient information is provided for LoRA initialization.
+            assert self.lora_paths or (
+                self.max_lora_rank and self.lora_target_modules
+            ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."

    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
        larger_tp = max(decode_tp, prefill_tp)
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -2892,3 +2892,17 @@ def parse_module_path(module_path, function_name, create_dummy):
        return final_module, getattr(final_module, function_name)

    return final_module, None
+
+
+# LoRA-related constants (also used as the `--lora-target-modules` CLI choices).
+SUPPORTED_LORA_TARGET_MODULES = [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+]
+
+LORA_TARGET_ALL_MODULES = "all"  # expands to every supported module above
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -507,6 +507,7 @@ class SRTRunner:
        sleep_on_idle=False,
        max_lora_rank: Optional[int] = None,
        lora_target_modules: Optional[List[str]] = None,
+        enable_lora: Optional[bool] = None,
    ):
        self.model_type = model_type
        self.is_generation = model_type == "generation"
@@ -547,6 +548,7 @@ class SRTRunner:
            sleep_on_idle=sleep_on_idle,
            max_lora_rank=max_lora_rank,
            lora_target_modules=lora_target_modules,
+            enable_lora=enable_lora,
            **spec_kwargs,
        )