Support starting the LoRA server without initial adapters (#8019)
This commit is contained in:
@@ -186,9 +186,9 @@ class LoRAManager:
|
||||
)
|
||||
if incompatible:
|
||||
raise ValueError(
|
||||
f"LoRA adapter {lora_name} with rank {lora_config.r} is incompatible with the current LoRA memory pool configuration."
|
||||
"We are still working on supporting dynamically updating LoRA shapes. If you expect to use adapters of different shapes, "
|
||||
"You can specify expected configs via --max_lora_rank and --enable_lora_modules."
|
||||
f"LoRA adapter {lora_name} with rank {lora_config.r} is incompatible with the current LoRA memory pool configuration. "
|
||||
"Please ensure that the LoRA adapter's rank is within the configured `--max_lora_rank` and that the target modules are "
|
||||
"included in `--enable_lora_modules`."
|
||||
)
|
||||
|
||||
def unload_lora_adapter(self, lora_name: str) -> LoRAUpdateResult:
|
||||
|
||||
@@ -574,7 +574,7 @@ class TokenizerManager:
|
||||
"The server is not configured to enable custom logit processor. "
|
||||
"Please set `--enable-custom-logits-processor` to enable this feature."
|
||||
)
|
||||
if self.server_args.lora_paths and obj.lora_path:
|
||||
if self.server_args.enable_lora and obj.lora_path:
|
||||
self._validate_lora_adapters(obj)
|
||||
|
||||
def _validate_input_ids_in_vocab(
|
||||
@@ -1037,6 +1037,10 @@ class TokenizerManager:
|
||||
_: Optional[fastapi.Request] = None,
|
||||
) -> LoadLoRAAdapterReqOutput:
|
||||
self.auto_create_handle_loop()
|
||||
if not self.server_args.enable_lora:
|
||||
raise ValueError(
|
||||
"LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
|
||||
)
|
||||
|
||||
# TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
|
||||
# with dp_size > 1.
|
||||
@@ -1060,6 +1064,10 @@ class TokenizerManager:
|
||||
_: Optional[fastapi.Request] = None,
|
||||
) -> UnloadLoRAAdapterReqOutput:
|
||||
self.auto_create_handle_loop()
|
||||
if not self.server_args.enable_lora:
|
||||
raise ValueError(
|
||||
"LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
|
||||
)
|
||||
|
||||
# TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
|
||||
# with dp_size > 1.
|
||||
|
||||
@@ -264,7 +264,7 @@ class CudaGraphRunner:
|
||||
if self.enable_torch_compile:
|
||||
set_torch_compile_config()
|
||||
|
||||
if self.model_runner.server_args.lora_paths is not None:
|
||||
if self.model_runner.server_args.enable_lora:
|
||||
self.model_runner.lora_manager.init_cuda_graph_batch_info(self.max_bs)
|
||||
|
||||
# Graph inputs
|
||||
@@ -510,11 +510,10 @@ class CudaGraphRunner:
|
||||
spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
|
||||
)
|
||||
|
||||
if self.model_runner.server_args.lora_paths is not None:
|
||||
# Currently, if the lora_path in `lora_paths` is None, the lora backend will use a
|
||||
# different logic to handle lora, so we need to set `lora_paths` to a list of non-None
|
||||
# values if lora is enabled.
|
||||
lora_paths = [next(iter(self.model_runner.server_args.lora_paths))] * bs
|
||||
if self.model_runner.server_args.enable_lora:
|
||||
# It is safe to capture CUDA graph using empty LoRA path, as the LoRA kernels will always be launched whenever
|
||||
# `--enable-lora` is set to True (and return immediately if the LoRA path is empty for perf optimization).
|
||||
lora_paths = [None] * bs
|
||||
else:
|
||||
lora_paths = None
|
||||
|
||||
|
||||
@@ -418,7 +418,7 @@ class ForwardBatch:
|
||||
ret._compute_mrope_positions(model_runner, batch)
|
||||
|
||||
# Init lora information
|
||||
if model_runner.server_args.lora_paths is not None:
|
||||
if model_runner.server_args.enable_lora:
|
||||
model_runner.lora_manager.prepare_lora_batch(ret)
|
||||
|
||||
TboForwardBatchPreparer.prepare(
|
||||
|
||||
@@ -304,11 +304,7 @@ class ModelRunner:
|
||||
self.apply_torch_tp()
|
||||
|
||||
# Init lora
|
||||
# TODO (lifuhuang): when we support dynamic LoRA loading / unloading, we should add
|
||||
# a new server arg `enable_lora` to control whether to init LoRA manager to be more
|
||||
# explicit, as it is perfectly valid to start a server with an empty lora_paths and
|
||||
# load LoRA adapters dynamically later.
|
||||
if server_args.lora_paths is not None:
|
||||
if server_args.enable_lora:
|
||||
self.init_lora_manager()
|
||||
|
||||
# Init memory pool and attention backends
|
||||
@@ -895,7 +891,7 @@ class ModelRunner:
|
||||
max_lora_rank=self.server_args.max_lora_rank,
|
||||
target_modules=self.server_args.lora_target_modules,
|
||||
)
|
||||
result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths)
|
||||
result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths or {})
|
||||
if result.success:
|
||||
logger.info(
|
||||
f"LoRA manager ready. Loaded LoRA adapters: {', '.join(result.loaded_adapters)}"
|
||||
|
||||
@@ -26,6 +26,8 @@ from typing import List, Literal, Optional, Union
|
||||
from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
|
||||
from sglang.srt.reasoning_parser import ReasoningParser
|
||||
from sglang.srt.utils import (
|
||||
LORA_TARGET_ALL_MODULES,
|
||||
SUPPORTED_LORA_TARGET_MODULES,
|
||||
configure_ipv6,
|
||||
get_device,
|
||||
get_device_memory_capacity,
|
||||
@@ -140,8 +142,9 @@ class ServerArgs:
|
||||
preferred_sampling_params: Optional[str] = None
|
||||
|
||||
# LoRA
|
||||
enable_lora: Optional[bool] = None
|
||||
max_lora_rank: Optional[int] = None
|
||||
lora_target_modules: Optional[List[str]] = None
|
||||
lora_target_modules: Optional[Union[set[str], List[str]]] = None
|
||||
lora_paths: Optional[Union[dict[str, str], List[str]]] = None
|
||||
max_loras_per_batch: int = 8
|
||||
lora_backend: str = "triton"
|
||||
@@ -1148,6 +1151,12 @@ class ServerArgs:
|
||||
)
|
||||
|
||||
# LoRA
|
||||
parser.add_argument(
|
||||
"--enable-lora",
|
||||
default=ServerArgs.enable_lora,
|
||||
action="store_true",
|
||||
help="Enable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-lora-rank",
|
||||
default=ServerArgs.max_lora_rank,
|
||||
@@ -1157,18 +1166,12 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--lora-target-modules",
|
||||
type=str,
|
||||
choices=[
|
||||
"q_proj",
|
||||
"k_proj",
|
||||
"v_proj",
|
||||
"o_proj",
|
||||
"gate_proj",
|
||||
"up_proj",
|
||||
"down_proj",
|
||||
],
|
||||
choices=SUPPORTED_LORA_TARGET_MODULES + [LORA_TARGET_ALL_MODULES],
|
||||
nargs="*",
|
||||
default=None,
|
||||
help="The union set of all target modules where LoRA should be applied. If not specified, it will be automatically inferred from the adapters provided in --lora-paths.",
|
||||
help="The union set of all target modules where LoRA should be applied. If not specified, "
|
||||
"it will be automatically inferred from the adapters provided in --lora-paths. If 'all' is specified, "
|
||||
"all supported modules will be targeted.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lora-paths",
|
||||
@@ -1816,15 +1819,46 @@ class ServerArgs:
|
||||
None,
|
||||
}, "moe_dense_tp_size only support 1 and None currently"
|
||||
|
||||
if isinstance(self.lora_paths, list):
|
||||
lora_paths = self.lora_paths
|
||||
self.lora_paths = {}
|
||||
for lora_path in lora_paths:
|
||||
if "=" in lora_path:
|
||||
name, path = lora_path.split("=", 1)
|
||||
self.lora_paths[name] = path
|
||||
else:
|
||||
self.lora_paths[lora_path] = lora_path
|
||||
self.check_lora_server_args()
|
||||
|
||||
def check_lora_server_args(self):
    """Validate and normalize the LoRA-related server arguments in place.

    Steps, in order:

    1. If ``lora_paths`` is non-empty and ``enable_lora`` is unset (``None``),
       turn ``enable_lora`` on for backward compatibility. If ``enable_lora``
       is explicitly ``False``, only log a warning that the paths are ignored.
    2. If LoRA ends up disabled, stop; nothing else is touched.
    3. Convert a list-form ``lora_paths`` (``"name=path"`` or bare ``"path"``
       entries) into a ``{name: path}`` dict; a bare entry is used as both
       name and path.
    4. Convert ``lora_target_modules`` to a set, expanding the single entry
       ``"all"`` to every module in ``SUPPORTED_LORA_TARGET_MODULES``.
    5. Require either initial ``lora_paths`` or both ``max_lora_rank`` and
       ``lora_target_modules``.

    Raises:
        AssertionError: if ``"all"`` is combined with other target modules,
            or if there is not enough information to initialize LoRA. The
            checks are raised explicitly (rather than via ``assert``) so they
            still run when Python is started with ``-O``.
    """
    # Enable LoRA if any LoRA paths are provided for backward compatibility.
    if self.lora_paths:
        if self.enable_lora is None:
            self.enable_lora = True
            logger.info(
                "--enable-lora is set to True because --lora-paths is provided."
            )
        elif self.enable_lora is False:
            logger.warning(
                "--enable-lora is set to False, any provided lora_paths will be ignored."
            )

    if not self.enable_lora:
        return

    # Normalize lora_paths to a dictionary if it is a list.
    if isinstance(self.lora_paths, list):
        normalized_paths = {}
        for entry in self.lora_paths:
            # Split on the first "=" only, so paths may themselves contain "=".
            name, separator, path = entry.partition("=")
            normalized_paths[name] = path if separator else entry
        self.lora_paths = normalized_paths

    # Expand target modules.
    if self.lora_target_modules:
        self.lora_target_modules = set(self.lora_target_modules)
        if "all" in self.lora_target_modules:
            if len(self.lora_target_modules) != 1:
                raise AssertionError(
                    "If 'all' is specified in --lora-target-modules, it should be the only module specified."
                )
            self.lora_target_modules = set(SUPPORTED_LORA_TARGET_MODULES)

    # Ensure sufficient information is provided for LoRA initialization.
    has_shape_hints = self.max_lora_rank and self.lora_target_modules
    if not (self.lora_paths or has_shape_hints):
        raise AssertionError(
            "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
        )
|
||||
|
||||
def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
|
||||
larger_tp = max(decode_tp, prefill_tp)
|
||||
|
||||
@@ -2892,3 +2892,17 @@ def parse_module_path(module_path, function_name, create_dummy):
|
||||
return final_module, getattr(final_module, function_name)
|
||||
|
||||
return final_module, None
|
||||
|
||||
|
||||
# LoRA-related constants and utilities
|
||||
# Module names accepted by `--lora-target-modules`. Also the set that the
# special value "all" is expanded to during server-argument validation.
# Kept as a list because the argparse `choices` are built by concatenating
# this with another list.
SUPPORTED_LORA_TARGET_MODULES = [
|
||||
"q_proj",
|
||||
"k_proj",
|
||||
"v_proj",
|
||||
"o_proj",
|
||||
"gate_proj",
|
||||
"up_proj",
|
||||
"down_proj",
|
||||
]
|
||||
|
||||
# Special `--lora-target-modules` value meaning "target every supported module".
LORA_TARGET_ALL_MODULES = "all"
|
||||
|
||||
@@ -507,6 +507,7 @@ class SRTRunner:
|
||||
sleep_on_idle=False,
|
||||
max_lora_rank: Optional[int] = None,
|
||||
lora_target_modules: Optional[List[str]] = None,
|
||||
enable_lora: Optional[bool] = None,
|
||||
):
|
||||
self.model_type = model_type
|
||||
self.is_generation = model_type == "generation"
|
||||
@@ -547,6 +548,7 @@ class SRTRunner:
|
||||
sleep_on_idle=sleep_on_idle,
|
||||
max_lora_rank=max_lora_rank,
|
||||
lora_target_modules=lora_target_modules,
|
||||
enable_lora=enable_lora,
|
||||
**spec_kwargs,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user