[2/2] Introduce Chunked-SGMV kernels and corresponding LoRA backend for improved performance (#10286)

This commit is contained in:
Lifu Huang
2025-09-15 16:04:03 -07:00
committed by GitHub
parent 2689f0bf02
commit 3f41b48c40
10 changed files with 1499 additions and 13 deletions

View File

@@ -110,6 +110,8 @@ ATTENTION_BACKEND_CHOICES = [
 "ascend",
 ]
+LORA_BACKEND_CHOICES = ["triton", "csgmv"]
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
 GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
@@ -1601,7 +1603,8 @@ class ServerArgs:
 parser.add_argument(
 "--lora-backend",
 type=str,
-default="triton",
+choices=LORA_BACKEND_CHOICES,
+default=ServerArgs.lora_backend,
 help="Choose the kernel backend for multi-LoRA serving.",
 )