[2/2] Introduce Chunked-SGMV kernels and corresponding LoRA backend for improved performance (#10286)

2025-09-15 16:04:03 -07:00
parent 2689f0bf02
commit 3f41b48c40
10 changed files with 1499 additions and 13 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -110,6 +110,8 @@ ATTENTION_BACKEND_CHOICES = [
    "ascend",
 ]

+LORA_BACKEND_CHOICES = ["triton", "csgmv"]
+
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]

 GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
@@ -1601,7 +1603,8 @@ class ServerArgs:
        parser.add_argument(
            "--lora-backend",
            type=str,
-            default="triton",
+            choices=LORA_BACKEND_CHOICES,
+            default=ServerArgs.lora_backend,
            help="Choose the kernel backend for multi-LoRA serving.",
        )