[Feature] Define backends and add Triton backend for LoRA (#3161)

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
This commit is contained in:
Baizhou Zhang
2025-02-03 22:09:13 -08:00
committed by GitHub
parent 7b5a374114
commit 70817a7eae
18 changed files with 1129 additions and 135 deletions

View File

@@ -113,6 +113,7 @@ class ServerArgs:
# LoRA
lora_paths: Optional[List[str]] = None
max_loras_per_batch: int = 8
lora_backend: str = "triton"
# Kernel backend
attention_backend: Optional[str] = None
@@ -653,13 +654,19 @@ class ServerArgs:
nargs="*",
default=None,
action=LoRAPathAction,
help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}.",
)
parser.add_argument(
"--max-loras-per-batch",
type=int,
default=8,
help="Maximum number of adapters for a running batch, include base-only request",
help="Maximum number of adapters for a running batch, include base-only request.",
)
parser.add_argument(
"--lora-backend",
type=str,
default="triton",
help="Choose the kernel backend for multi-LoRA serving.",
)
# Kernel backend