[2/2] Introduce Chunked-SGMV kernels and corresponding LoRA backend for improved performance (#10286)

2025-09-15 16:04:03 -07:00
parent 2689f0bf02
commit 3f41b48c40
10 changed files with 1499 additions and 13 deletions
--- a/python/sglang/srt/lora/lora.py
+++ b/python/sglang/srt/lora/lora.py
@@ -28,14 +28,15 @@ from torch import nn
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
 from sglang.srt.lora.backend.base_backend import BaseLoRABackend
-
-# from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
+from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
 from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.model_loader.loader import DefaultModelLoader

 logger = logging.getLogger(__name__)

+SUPPORTED_BACKENDS = (TritonLoRABackend, ChunkedSgmvLoRABackend)
+

 class LoRALayer(nn.Module):
    def __init__(self, config: LoRAConfig, base_hf_config: AutoConfig):
@@ -48,6 +49,7 @@ class LoRALayer(nn.Module):


 class LoRAAdapter(nn.Module):
+
    def __init__(
        self,
        uid: str,
@@ -159,8 +161,8 @@ class LoRAAdapter(nn.Module):
                gate_up_name = weight_name.replace("gate_proj", "gate_up_proj")
                if up_name not in weights:
                    weights[up_name] = torch.zeros_like(weights[weight_name])
-                    assert isinstance(self.lora_backend, TritonLoRABackend), (
-                        f"LoRA weight initialization currently only supported for 'triton' backend. "
+                    assert isinstance(self.lora_backend, SUPPORTED_BACKENDS), (
+                        f"LoRA weight initialization currently only supported for LoRA backends: {', '.join(b.name for b in SUPPORTED_BACKENDS)}"
                        f"Received backend: {self.lora_backend.name}. Please verify your backend configuration "
                        f"or consider implementing custom initialization logic for other backends."
                    )