[Feature] Initial support for multi-LoRA serving (#1307)

This commit is contained in:
Ying Sheng
2024-09-12 16:46:14 -07:00
committed by GitHub
parent c33d82a211
commit 712216928f
21 changed files with 1435 additions and 22 deletions

View File

@@ -35,6 +35,7 @@ import torch
import torch.distributed as dist
from fastapi.responses import JSONResponse
from packaging import version as pkg_version
from torch import nn
from torch.nn.parameter import Parameter
from triton.runtime.cache import (
FileCacheManager,
@@ -714,3 +715,14 @@ def configure_logger(server_args, prefix: str = ""):
datefmt="%H:%M:%S",
force=True,
)
# source: https://github.com/vllm-project/vllm/blob/93b38bea5dd03e1b140ca997dfaadef86f8f1855/vllm/lora/utils.py#L9
def replace_submodule(
model: nn.Module, module_name: str, new_module: nn.Module
) -> nn.Module:
"""Replace a submodule in a model with a new module."""
parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
target_name = module_name.split(".")[-1]
setattr(parent, target_name, new_module)
return new_module