[2/2] Introduce Chunked-SGMV kernels and corresponding LoRA backend for improved performance (#10286)

This commit is contained in:
Lifu Huang
2025-09-15 16:04:03 -07:00
committed by GitHub
parent 2689f0bf02
commit 3f41b48c40
10 changed files with 1499 additions and 13 deletions

View File

@@ -28,14 +28,15 @@ from torch import nn
from sglang.srt.configs.load_config import LoadConfig
from sglang.srt.hf_transformers_utils import AutoConfig
from sglang.srt.lora.backend.base_backend import BaseLoRABackend
# from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
from sglang.srt.lora.lora_config import LoRAConfig
from sglang.srt.model_loader.loader import DefaultModelLoader
logger = logging.getLogger(__name__)
SUPPORTED_BACKENDS = (TritonLoRABackend, ChunkedSgmvLoRABackend)
class LoRALayer(nn.Module):
def __init__(self, config: LoRAConfig, base_hf_config: AutoConfig):
@@ -48,6 +49,7 @@ class LoRALayer(nn.Module):
class LoRAAdapter(nn.Module):
def __init__(
self,
uid: str,
@@ -159,8 +161,8 @@ class LoRAAdapter(nn.Module):
gate_up_name = weight_name.replace("gate_proj", "gate_up_proj")
if up_name not in weights:
weights[up_name] = torch.zeros_like(weights[weight_name])
assert isinstance(self.lora_backend, TritonLoRABackend), (
f"LoRA weight initialization currently only supported for 'triton' backend. "
# Guard this weight-initialization path: only backends listed in
# SUPPORTED_BACKENDS are accepted. Adjacent f-string fragments are joined
# verbatim by the parser, so every fragment must end with its own separator;
# otherwise the backend list runs straight into "Received backend".
assert isinstance(self.lora_backend, SUPPORTED_BACKENDS), (
    f"LoRA weight initialization currently only supported for LoRA backends: {', '.join(b.name for b in SUPPORTED_BACKENDS)}. "
    f"Received backend: {self.lora_backend.name}. Please verify your backend configuration "
    f"or consider implementing custom initialization logic for other backends."
)