[2/2] Introduce Chunked-SGMV kernels and corresponding LoRA backend for improved performance (#10286)

2025-09-15 16:04:03 -07:00
parent 2689f0bf02
commit 3f41b48c40
10 changed files with 1499 additions and 13 deletions
--- a/docs/advanced_features/lora.ipynb
+++ b/docs/advanced_features/lora.ipynb
@@ -35,7 +35,7 @@
    "\n",
    "* `max_loaded_loras`: If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `max-loras-per-batch`.\n",
    "\n",
-    "* `lora_backend`: The backend of running GEMM kernels for Lora modules. Currently we only support Triton LoRA backend. In the future, faster backend built upon Cutlass or Cuda kernels will be added.\n",
+    "* `lora_backend`: The backend of running GEMM kernels for Lora modules. Currently we support Triton LoRA backend (`triton`) and Chunked SGMV backend (`csgmv`). In the future, faster backend built upon Cutlass or Cuda kernels will be added.\n",
    "\n",
    "* `max_lora_rank`: The maximum LoRA rank that should be supported. If not specified, it will be automatically inferred from the adapters provided in `--lora-paths`. This argument is needed when you expect to dynamically load adapters of larger LoRA rank after server startup.\n",
    "\n",
@@ -79,7 +79,7 @@
    "python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "    --enable-lora \\\n",
    "    --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
-    "    --max-loras-per-batch 1 --lora-backend triton \\\n",
+    "    --max-loras-per-batch 1 \\\n",
    "    --log-level warning \\\n",
    "\"\"\"\n",
    ")\n",
@@ -139,7 +139,7 @@
    "    --enable-lora \\\n",
    "    --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
    "    lora1=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n",
-    "    --max-loras-per-batch 2 --lora-backend triton \\\n",
+    "    --max-loras-per-batch 2 \\\n",
    "    --log-level warning \\\n",
    "\"\"\"\n",
    ")\n",
@@ -214,7 +214,7 @@
    "    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "    --enable-lora \\\n",
    "    --cuda-graph-max-bs 2 \\\n",
-    "    --max-loras-per-batch 2 --lora-backend triton \\\n",
+    "    --max-loras-per-batch 2 \\\n",
    "    --max-lora-rank 256\n",
    "    --lora-target-modules all\n",
    "    --log-level warning\n",
@@ -413,7 +413,7 @@
    "    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "    --enable-lora \\\n",
    "    --cuda-graph-max-bs 8 \\\n",
-    "    --max-loras-per-batch 3 --lora-backend triton \\\n",
+    "    --max-loras-per-batch 3 \\\n",
    "    --max-lora-rank 256 \\\n",
    "    --lora-target-modules all \\\n",
    "    --lora-paths \\\n",
@@ -501,6 +501,48 @@
    "terminate_process(server_process)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Choosing LoRA Backend\n",
    "\n",
    "SGLang supports two LoRA backends that you can choose from using the `--lora-backend` argument:\n",
    "\n",
    "- `triton`: Default basic Triton-based backend.\n",
    "- `csgmv`: Chunked SGMV backend optimized for high concurrency scenarios.\n",
    "\n",
    "The `csgmv` backend was recently introduced to improve performance especially at high-concurrency scenarios. Our benchmark shows that it achieves 20% to 80% latency improvements over the basic triton backend.\n",
    "Currently it is at preview phase, we expect to make it our the default LoRA backend in future release. Before that, you can adopt it by manually setting the `--lora-backend` server config."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "server_process, port = launch_server_cmd(\n",
    "    \"\"\"\n",
    "    python3 -m sglang.launch_server \\\n",
    "    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "    --enable-lora \\\n",
    "    --lora-backend csgmv \\\n",
    "    --max-loras-per-batch 16 \\\n",
    "    --lora-paths lora1=path/to/lora1 lora2=path/to/lora2\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "terminate_process(server_process)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
--- a/python/sglang/srt/lora/backend/base_backend.py
+++ b/python/sglang/srt/lora/backend/base_backend.py
@@ -143,10 +143,10 @@ def get_backend_from_name(name: str) -> BaseLoRABackend:
        from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
        return TritonLoRABackend
-    # elif name == "csgmv":
+    elif name == "csgmv":
-    #     from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
+        from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
-    #     return ChunkedSgmvLoRABackend
+        return ChunkedSgmvLoRABackend
    elif name == "flashinfer":
        raise ValueError(
            "FlashInfer LoRA backend has been deprecated, please use `triton` instead."
--- a/python/sglang/srt/lora/backend/chunked_backend.py
+++ b/python/sglang/srt/lora/backend/chunked_backend.py
@@ -0,0 +1,306 @@
 from typing import Optional
 import torch
 from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 from sglang.srt.lora.triton_ops import (
    chunked_sgmv_lora_expand_forward,
    chunked_sgmv_lora_shrink_forward,
 )
 from sglang.srt.lora.utils import LoRABatchInfo
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 class ChunkedSgmvLoRABackend(BaseLoRABackend):
    """
    Chunked LoRA backend using segmented matrix-vector multiplication.
    This backend is largely based on the SGMV (Segmented Gather Matrix-Vector multiplication) algorithm
    introduced in the Punica paper (https://arxiv.org/pdf/2310.18547). One main variation made here is to
    segment the input sequences into fixed-size chunks, which reduces excessive kernel launches especially
    when the LoRA distribution is skewed.
    """
    name = "csgmv"
    def __init__(self, max_loras_per_batch: int, device: torch.device):
        super().__init__(max_loras_per_batch, device)
        self.segment_size = 16  # TODO (lifuhuang): make it configurable?
    def run_lora_a_sgemm(
        self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
    ) -> torch.Tensor:
        return chunked_sgmv_lora_shrink_forward(
            x,
            weights,
            self.batch_info,
        )
    def run_lora_b_sgemm(
        self,
        x: torch.Tensor,
        weights: torch.Tensor,
        output_offset: torch.Tensor,
        base_output: torch.Tensor = None,
        *args,
        **kwargs
    ) -> torch.Tensor:
        # For simple lora B, we use slice offsets [0, output_dim]
        output_dim = weights.shape[-2]
        max_slice_size = output_dim
        return chunked_sgmv_lora_expand_forward(
            x=x,
            lora_weight_b=weights,
            batch_info=self.batch_info,
            slice_offsets=output_offset,
            max_slice_size=max_slice_size,
            base_output=base_output,
        )
    def run_qkv_lora(
        self,
        x: torch.Tensor,
        qkv_lora_a: torch.Tensor,
        qkv_lora_b: torch.Tensor,
        output_offset: torch.Tensor,
        max_qkv_out_dim: int,
        base_output: torch.Tensor = None,
        *args,
        **kwargs
    ) -> torch.Tensor:
        # x: (s, input_dim)
        # qkv_lora_a: (num_lora, 3 * r, input_dim)
        # qkv_lora_b: (num_lora, output_dim_q + 2 * output_dim_kv, r)
        assert isinstance(qkv_lora_b, torch.Tensor)
        lora_a_output = chunked_sgmv_lora_shrink_forward(
            x,
            qkv_lora_a,
            self.batch_info,
            num_slices=3,
        )
        lora_output = chunked_sgmv_lora_expand_forward(
            x=lora_a_output,
            lora_weight_b=qkv_lora_b,
            batch_info=self.batch_info,
            slice_offsets=output_offset,
            max_slice_size=max_qkv_out_dim,
            base_output=base_output,
        )
        return lora_output
    def run_gate_up_lora(
        self,
        x: torch.Tensor,
        gate_up_lora_a: torch.Tensor,
        gate_up_lora_b: torch.Tensor,
        output_offset: torch.Tensor,
        base_output: torch.Tensor = None,
        *args,
        **kwargs
    ) -> torch.Tensor:
        # x: (s, input_dim)
        # gate_up_lora_a: (num_lora, 2 * r, input_dim)
        # gate_up_lora_b: (num_lora, 2 * output_dim, r)
        assert isinstance(gate_up_lora_b, torch.Tensor)
        output_dim = gate_up_lora_b.shape[-2] // 2
        # lora_a_output: (s, 2 * r)
        lora_a_output = chunked_sgmv_lora_shrink_forward(
            x,
            gate_up_lora_a,
            self.batch_info,
            num_slices=2,
        )
        lora_output = chunked_sgmv_lora_expand_forward(
            x=lora_a_output,
            lora_weight_b=gate_up_lora_b,
            batch_info=self.batch_info,
            slice_offsets=output_offset,
            max_slice_size=output_dim,
            base_output=base_output,
        )
        return lora_output
    def prepare_lora_batch(
        self,
        forward_batch: ForwardBatch,
        weight_indices: list[int],
        lora_ranks: list[int],
        scalings: list[float],
        batch_info: Optional[LoRABatchInfo] = None,
    ):
        permutation, weight_indices_reordered = ChunkedSgmvLoRABackend._get_permutation(
            weight_indices, forward_batch
        )
        seg_weight_indices, seg_indptr = self._get_segments_info(
            weight_indices_reordered
        )
        num_segments = len(seg_weight_indices)
        lora_ranks_tensor = torch.tensor(
            lora_ranks, dtype=torch.int32, pin_memory=True, device="cpu"
        )
        scalings_tensor = torch.tensor(
            scalings, dtype=torch.float, pin_memory=True, device="cpu"
        )
        if batch_info is None:
            batch_info = LoRABatchInfo(
                bs=forward_batch.batch_size,
                num_segments=num_segments,
                use_cuda_graph=False,
                seg_indptr=torch.empty(
                    (num_segments + 1,), dtype=torch.int32, device=self.device
                ),
                weight_indices=torch.empty(
                    (num_segments,), dtype=torch.int32, device=self.device
                ),
                lora_ranks=torch.empty(
                    (self.max_loras_per_batch,), dtype=torch.int32, device=self.device
                ),
                scalings=torch.empty(
                    (self.max_loras_per_batch,), dtype=torch.float, device=self.device
                ),
                permutation=torch.empty(
                    (len(permutation),), dtype=torch.int32, device=self.device
                ),
                # Not used in chunked kernels
                max_len=None,
                seg_lens=None,
            )
        else:
            batch_info.bs = forward_batch.batch_size
            batch_info.num_segments = num_segments
        # Copy to device asynchronously
        batch_info.lora_ranks[: self.max_loras_per_batch].copy_(
            lora_ranks_tensor, non_blocking=True
        )
        batch_info.scalings[: self.max_loras_per_batch].copy_(
            scalings_tensor, non_blocking=True
        )
        batch_info.weight_indices[:num_segments].copy_(
            seg_weight_indices, non_blocking=True
        )
        batch_info.seg_indptr[: num_segments + 1].copy_(seg_indptr, non_blocking=True)
        batch_info.permutation[: len(permutation)].copy_(permutation, non_blocking=True)
        self.batch_info = batch_info
    @staticmethod
    def _get_permutation(seq_weight_indices, forward_batch: ForwardBatch):
        """
        Computes permutation indices for reordering tokens by their LoRA adapter assignments.
        This function implements the "gather" step in Chunked Segmented Gather Matrix Vector
        multiplication by creating a permutation that groups tokens by their LoRA adapter.
        Tokens using the same LoRA adapter are placed together to enable efficient batched
        computation.
        Example:
            seq_weight_indices = [0, 1, 0]  # 3 sequences using adapters [0, 1, 0]
            extend_seq_lens = [2, 1, 3]     # sequence lengths [2, 1, 3 tokens]
            # Creates row_weight_indices: [0, 0, 1, 0, 0, 0] (6 tokens total)
            # Returns permutation: [0, 1, 3, 4, 5, 2] (groups adapter 0 tokens together)
            # weights_reordered: [0, 0, 0, 0, 0, 1] (sorted by adapter)
        Args:
            seq_weight_indices: List of LoRA adapter indices for each sequence
            forward_batch (ForwardBatch): Batch information containing sequence lengths
        Returns:
            tuple: (permutation, weights_reordered) where:
                - permutation: Token reordering indices to group by adapter
                - weights_reordered: Sorted adapter indices for each token
        """
        with torch.device("cpu"):
            seq_weight_indices = torch.tensor(seq_weight_indices, dtype=torch.int32)
            seg_lens_cpu = (
                torch.tensor(
                    forward_batch.extend_seq_lens_cpu,
                    dtype=torch.int32,
                )
                if forward_batch.forward_mode.is_extend()
                else torch.ones(forward_batch.batch_size, dtype=torch.int32)
            )
            row_weight_indices = torch.repeat_interleave(
                seq_weight_indices, seg_lens_cpu
            )
            permutation = torch.empty(
                (len(row_weight_indices),), dtype=torch.long, pin_memory=True
            )
            torch.argsort(row_weight_indices, stable=True, out=permutation)
            weights_reordered = row_weight_indices[permutation]
            return permutation, weights_reordered
    def _get_segments_info(self, weights_reordered: torch.Tensor):
        """
        Computes segment information for chunked SGMV operations.
        This function takes the reordered weight indices and creates segments of fixed size
        (self.segment_size) for efficient kernel execution. Each segment contains tokens
        that use the same LoRA adapter, enabling vectorized computation.
        The segmentation is necessary because:
        1. GPU kernels work efficiently on fixed-size blocks
        2. Large groups of tokens using the same adapter are split into manageable chunks
        3. Each segment can be processed independently in parallel
        Example:
            weights_reordered = [0, 0, 0, 0, 0, 1]  # 5 tokens with adapter 0, 1 with adapter 1
            segment_size = 3
            # Creates segments:
            # Segment 0: tokens 0-2 (adapter 0), length=3
            # Segment 1: tokens 3-4 (adapter 0), length=2
            # Segment 2: token 5 (adapter 1), length=1
            # Returns:
            # weight_indices_list: [0, 0, 1] (adapter for each segment)
            # seg_indptr: [0, 3, 5, 6] (cumulative segment boundaries)
        Args:
            weights_reordered (torch.Tensor): Sorted adapter indices for each token
        Returns:
            tuple: (weight_indices_list, seg_indptr) where:
                - weight_indices_list: LoRA adapter index for each segment
                - seg_indptr: Cumulative segment boundaries (CSR-style indptr)
        """
        with torch.device("cpu"):
            unique_weights, counts = torch.unique_consecutive(
                weights_reordered, return_counts=True
            )
            weight_indices_list = []
            seg_lens_list = []
            for weight_idx, group_len in zip(unique_weights, counts):
                group_len = group_len.item()
                num_segs = (group_len + self.segment_size - 1) // self.segment_size
                weight_indices_list.extend([weight_idx.item()] * num_segs)
                seg_lens_list.extend([self.segment_size] * (num_segs - 1))
                seg_lens_list.append(group_len - (num_segs - 1) * self.segment_size)
            seg_lens = torch.tensor(seg_lens_list, dtype=torch.int32)
            weight_indices_list = torch.tensor(
                weight_indices_list, dtype=torch.int32, pin_memory=True
            )
            seg_indptr = torch.empty(
                (len(seg_lens) + 1,), dtype=torch.int32, pin_memory=True
            )
            seg_indptr[0] = 0
            seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
            return weight_indices_list, seg_indptr
--- a/python/sglang/srt/lora/lora.py
+++ b/python/sglang/srt/lora/lora.py
@@ -28,14 +28,15 @@ from torch import nn
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
 from sglang.srt.lora.backend.base_backend import BaseLoRABackend
-
+from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
 # from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
 from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.model_loader.loader import DefaultModelLoader
 logger = logging.getLogger(__name__)
 SUPPORTED_BACKENDS = (TritonLoRABackend, ChunkedSgmvLoRABackend)
 class LoRALayer(nn.Module):
    def __init__(self, config: LoRAConfig, base_hf_config: AutoConfig):
@@ -48,6 +49,7 @@ class LoRALayer(nn.Module):
 class LoRAAdapter(nn.Module):
    def __init__(
        self,
        uid: str,
@@ -159,8 +161,8 @@ class LoRAAdapter(nn.Module):
                gate_up_name = weight_name.replace("gate_proj", "gate_up_proj")
                if up_name not in weights:
                    weights[up_name] = torch.zeros_like(weights[weight_name])
-                    assert isinstance(self.lora_backend, TritonLoRABackend), (
+                    assert isinstance(self.lora_backend, SUPPORTED_BACKENDS), (
-                        f"LoRA weight initialization currently only supported for 'triton' backend. "
+                        f"LoRA weight initialization currently only supported for LoRA backends: {', '.join(b.name for b in SUPPORTED_BACKENDS)}"
                        f"Received backend: {self.lora_backend.name}. Please verify your backend configuration "
                        f"or consider implementing custom initialization logic for other backends."
                    )
--- a/python/sglang/srt/lora/triton_ops/init.py
+++ b/python/sglang/srt/lora/triton_ops/init.py
@@ -1,3 +1,5 @@
 from .chunked_sgmv_expand import chunked_sgmv_lora_expand_forward
 from .chunked_sgmv_shrink import chunked_sgmv_lora_shrink_forward
 from .gate_up_lora_b import gate_up_lora_b_fwd
 from .qkv_lora_b import qkv_lora_b_fwd
 from .sgemm_lora_a import sgemm_lora_a_fwd
@@ -8,4 +10,6 @@ __all__ = [
    "qkv_lora_b_fwd",
    "sgemm_lora_a_fwd",
    "sgemm_lora_b_fwd",
    "chunked_sgmv_lora_shrink_forward",
    "chunked_sgmv_lora_expand_forward",
 ]
--- a/python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py
+++ b/python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py
@@ -0,0 +1,211 @@
 from typing import Optional
 import torch
 import triton
 import triton.language as tl
 from sglang.srt.lora.utils import LoRABatchInfo
@triton.jit
 def _chunked_lora_expand_kernel(
    # Pointers to matrices
    x,
    weights,
    output,
    # Parameters of size
    # Strides
    x_stride_0,
    x_stride_1,
    w_stride_0,
    w_stride_1,
    w_stride_2,
    output_stride_0,
    output_stride_1,
    # Information on sequence lengths and weight id
    seg_indptr,
    weight_indices,
    lora_ranks,
    permutation,
    num_segs,
    # For fused output scaling
    scalings,
    # Offsets of q/k/v slice on output dimension
    slice_offsets,
    # Meta parameters
    NUM_SLICES: tl.constexpr,
    MAX_RANK: tl.constexpr,  # K = R
    BLOCK_S: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
 ):
    """
    Computes a chunked SGMV for LoRA expand operations.
    When a sequence's rank is 0, the kernel is essentially a no-op, following
    the convention in pytorch where the product of two matrices of shape (m, 0)
    and (0, n) is an all-zero matrix of shape (m, n).
    Args:
        x (Tensor): The input tensor, which is the result of the LoRA A projection.
            Shape: (s, num_slices * K), where s is the sum of all sequence lengths in the
            batch and K is the maximum LoRA rank.
        weights (Tensor): The LoRA B weights for all adapters.
            Shape: (num_lora, output_dim, K).
        output (Tensor): The output tensor where the result is stored.
            Shape: (s, output_dim).
    """
    tl.static_assert(NUM_SLICES <= 3)
    pid_s = tl.program_id(axis=2)
    if pid_s >= num_segs:
        return
    # Current block computes sequence with batch_id,
    # which starts from row seg_start of x with length seg_len.
    # qkv_id decides which of q,k,v to compute (0: q, 1: k, 2: v)
    w_index = tl.load(weight_indices + pid_s)
    cur_rank = tl.load(lora_ranks + w_index)
    # If rank is 0, this kernel is a no-op.
    if cur_rank == 0:
        return
    seg_start = tl.load(seg_indptr + pid_s)
    seg_end = tl.load(seg_indptr + pid_s + 1)
    slice_id = tl.program_id(axis=1)
    slice_start = tl.load(slice_offsets + slice_id)
    slice_end = tl.load(slice_offsets + slice_id + 1)
    scaling = tl.load(scalings + w_index)
    # Adjust K (rank) according to the specific LoRA adapter
    cur_rank = tl.minimum(MAX_RANK, cur_rank)
    # Map logical sequence index to physical index
    s_offset_logical = tl.arange(0, BLOCK_S) + seg_start
    s_offset_physical = tl.load(
        permutation + s_offset_logical, mask=s_offset_logical < seg_end
    )
    # Create pointers for the first block of x and weights[batch_id][n_start: n_end][:]
    # The pointers will be advanced as we move in the K direction
    # and accumulate
    pid_n = tl.program_id(axis=0)
    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + slice_start
    k_offset = tl.arange(0, BLOCK_K)
    x_ptrs = (
        x
        + slice_id * cur_rank * x_stride_1
        + (s_offset_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
    )
    w_ptrs = (weights + w_index * w_stride_0) + (
        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
    )
    # Iterate to compute the block in output matrix
    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(cur_rank, BLOCK_K)):
        x_tile = tl.load(
            x_ptrs,
            mask=(s_offset_logical[:, None] < seg_end)
            & (k_offset[None, :] < cur_rank - k * BLOCK_K),
            other=0.0,
        )
        w_tile = tl.load(
            w_ptrs,
            mask=(k_offset[:, None] < cur_rank - k * BLOCK_K)
            & (n_offset[None, :] < slice_end),
            other=0.0,
        )
        partial_sum += tl.dot(x_tile, w_tile)
        x_ptrs += BLOCK_K * x_stride_1
        w_ptrs += BLOCK_K * w_stride_2
    # Store result to output matrix
    partial_sum *= scaling
    partial_sum = partial_sum.to(x.dtype.element_ty)
    output_ptr = output + (
        s_offset_physical[:, None] * output_stride_0
        + n_offset[None, :] * output_stride_1
    )
    output_mask = (s_offset_logical[:, None] < seg_end) & (
        n_offset[None, :] < slice_end
    )
    partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0)
    tl.store(output_ptr, partial_sum, mask=output_mask)
 def chunked_sgmv_lora_expand_forward(
    x: torch.Tensor,
    lora_weight_b: torch.Tensor,
    batch_info: LoRABatchInfo,
    slice_offsets: torch.Tensor,
    max_slice_size: int,
    base_output: torch.Tensor = None,
 ) -> torch.Tensor:
    # x: (s, slice_num * r)
    # lora_weight_b: (num_lora, output_dim, r)
    # slice_offsets: boundaries for different slices in the output dimension
    # output: (s, output_dim)
    # Compute lora_output with shape (s, output_dim) as follows:
    # For each slice i, accumulates:
    # lora_output[:, slice_offsets[i]:slice_offsets[i+1]] += scaling * sgemm(x[:, i*cur_rank:(i+1)*cur_rank], lora_weight_b[:, slice_offsets[i]:slice_offsets[i+1], :])
    # Get dims
    s = x.shape[0]
    input_dim = x.shape[1]
    max_lora_rank = lora_weight_b.shape[-1]
    output_dim = lora_weight_b.shape[-2]
    num_slices = len(slice_offsets) - 1
    assert input_dim == num_slices * max_lora_rank
    # TODO (lifuhuang): fine-tune per operation
    BLOCK_M = 16
    BLOCK_K = 16
    BLOCK_N = 64
    num_segments = batch_info.num_segments
    grid = (
        triton.cdiv(max_slice_size, BLOCK_N),
        num_slices,  # number of slices in the input/output
        batch_info.bs if batch_info.use_cuda_graph else num_segments,
    )
    if base_output is None:
        output = torch.zeros((s, output_dim), device=x.device, dtype=x.dtype)
    else:
        output = base_output
    _chunked_lora_expand_kernel[grid](
        x=x,
        weights=lora_weight_b,
        output=output,
        x_stride_0=x.stride(0),
        x_stride_1=x.stride(1),
        w_stride_0=lora_weight_b.stride(0),
        w_stride_1=lora_weight_b.stride(1),
        w_stride_2=lora_weight_b.stride(2),
        output_stride_0=output.stride(0),
        output_stride_1=output.stride(1),
        seg_indptr=batch_info.seg_indptr,
        weight_indices=batch_info.weight_indices,
        lora_ranks=batch_info.lora_ranks,
        permutation=batch_info.permutation,
        num_segs=num_segments,
        scalings=batch_info.scalings,
        slice_offsets=slice_offsets,
        # constants
        NUM_SLICES=num_slices,
        MAX_RANK=max_lora_rank,
        BLOCK_S=BLOCK_M,
        BLOCK_N=BLOCK_N,
        BLOCK_K=BLOCK_K,
    )
    return output
--- a/python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py
+++ b/python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py
@@ -0,0 +1,177 @@
 import torch
 import triton
 import triton.language as tl
 from sglang.srt.lora.utils import LoRABatchInfo
@triton.jit
 def _chunked_lora_shrink_kernel(
    # Pointers to matrices
    x,
    weights,
    output,
    # Strides
    x_stride_0,
    x_stride_1,
    w_stride_0,
    w_stride_1,
    w_stride_2,
    output_stride_0,
    output_stride_1,
    # Information on sequence lengths,ranks and weight id
    seg_indptr,
    weight_indices,
    lora_ranks,
    permutation,
    num_segs,
    # Meta parameters
    N: tl.constexpr,  # num_slices * r
    K: tl.constexpr,  # input_dim
    NUM_SLICES: tl.constexpr,
    BLOCK_S: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
 ):
    """
    Computes a chunked SGMV for LoRA shrink operations.
    The kernel ensures that output[seg_start:seg_start + seg_len, :rank * num_slices]
    stores the product of the input `x` and the LoRA weights for the corresponding
    sequence. This implies that when rank is 0, the kernel is essentially a no-op,
    as output[seg_start:seg_start + seg_len, :0] is trivially correct (empty).
    Args:
        x (torch.Tensor): The input activations tensor of shape `(s, K)`, where `s`
            is the sum of all sequence lengths in the batch.
        weights (torch.Tensor): The LoRA A weights for all available adapters,
            with shape `(num_lora, N, K)` where N = num_slices * r.
        output (torch.Tensor): The output tensor of shape `(s, N)`.
    """
    pid_s = tl.program_id(1)
    if pid_s >= num_segs:
        return
    pid_n = tl.program_id(0)
    # Current block computes sequence with batch_id,
    # which starts from row seg_start of x with length seg_len
    w_index = tl.load(weight_indices + pid_s)
    rank = tl.load(lora_ranks + w_index)
    # If rank is 0, this kernel becomes a no-op as the output is always trivially correct.
    if rank == 0:
        return
    seg_start = tl.load(seg_indptr + pid_s)
    seg_end = tl.load(seg_indptr + pid_s + 1)
    # Adjust N dim according to the specific LoRA adapter
    cur_n = tl.minimum(N, rank * NUM_SLICES)
    # Map logical sequence index to physical index
    s_offset_logical = tl.arange(0, BLOCK_S) + seg_start
    s_offset_physical = tl.load(
        permutation + s_offset_logical, mask=s_offset_logical < seg_end
    )
    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
    k_offset = tl.arange(0, BLOCK_K)
    x_ptrs = x + (
        s_offset_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
    )
    w_ptrs = (weights + w_index * w_stride_0) + (
        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
    )
    # Iterate to compute the block in output matrix
    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        x_tile = tl.load(
            x_ptrs,
            mask=(s_offset_logical[:, None] < seg_end)
            & (k_offset[None, :] < K - k * BLOCK_K),
            other=0.0,
        )
        w_tile = tl.load(
            w_ptrs,
            mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < cur_n),
            other=0.0,
        )
        partial_sum += tl.dot(x_tile, w_tile)
        x_ptrs += BLOCK_K * x_stride_1
        w_ptrs += BLOCK_K * w_stride_2
    # Store result to output matrix
    partial_sum = partial_sum.to(x.dtype.element_ty)
    output_ptr = output + (
        s_offset_physical[:, None] * output_stride_0
        + n_offset[None, :] * output_stride_1
    )
    output_mask = (s_offset_logical[:, None] < seg_end) & (n_offset[None, :] < cur_n)
    tl.store(output_ptr, partial_sum, mask=output_mask)
 def chunked_sgmv_lora_shrink_forward(
    x: torch.Tensor,
    weights: torch.Tensor,
    batch_info: LoRABatchInfo,
    num_slices: int = 1,
 ) -> torch.Tensor:
    # x: (s, input_dim)
    # weights: (num_lora, num_slices * r, input_dim)
    # output: (s, num_slices * r)
    # num_slices: qkv=3, gate_up=2, others=1
    # when called with multiple slices, the weights.shape[-2] will be num_slices * r
    # input_dim is much larger than r
    assert x.is_contiguous()
    assert weights.is_contiguous()
    assert len(x.shape) == 2
    assert len(weights.shape) == 3
    # Block shapes
    # TODO (lifuhuang): experiment with split-k
    BLOCK_S = 16
    BLOCK_N = 16
    BLOCK_K = 256
    S = x.shape[0]
    N = weights.shape[1]
    K = weights.shape[2]
    assert x.shape[-1] == K
    num_segments = batch_info.num_segments
    grid = (
        triton.cdiv(N, BLOCK_N),
        batch_info.bs if batch_info.use_cuda_graph else num_segments,
    )
    output = torch.empty((S, N), device=x.device, dtype=x.dtype)
    _chunked_lora_shrink_kernel[grid](
        x=x,
        weights=weights,
        output=output,
        x_stride_0=x.stride(0),
        x_stride_1=x.stride(1),
        w_stride_0=weights.stride(0),
        w_stride_1=weights.stride(1),
        w_stride_2=weights.stride(2),
        output_stride_0=output.stride(0),
        output_stride_1=output.stride(1),
        seg_indptr=batch_info.seg_indptr,
        weight_indices=batch_info.weight_indices,
        lora_ranks=batch_info.lora_ranks,
        permutation=batch_info.permutation,
        num_segs=num_segments,
        # constants
        N=N,
        K=K,
        NUM_SLICES=num_slices,
        BLOCK_S=BLOCK_S,
        BLOCK_N=BLOCK_N,
        BLOCK_K=BLOCK_K,
    )
    return output
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -110,6 +110,8 @@ ATTENTION_BACKEND_CHOICES = [
    "ascend",
 ]
 LORA_BACKEND_CHOICES = ["triton", "csgmv"]
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
 GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
@@ -1601,7 +1603,8 @@ class ServerArgs:
        parser.add_argument(
            "--lora-backend",
            type=str,
-            default="triton",
+            choices=LORA_BACKEND_CHOICES,
            default=ServerArgs.lora_backend,
            help="Choose the kernel backend for multi-LoRA serving.",
        )
--- a/test/srt/lora/test_chunked_sgmv_backend.py
+++ b/test/srt/lora/test_chunked_sgmv_backend.py
@@ -0,0 +1,740 @@
 import random
 import unittest
 from enum import Enum
 from typing import Dict, List, Optional, Tuple
 import torch
 from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
 from sglang.srt.lora.triton_ops import (
    chunked_sgmv_lora_expand_forward,
    chunked_sgmv_lora_shrink_forward,
 )
 from sglang.srt.lora.utils import LoRABatchInfo
 def safe_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Matrix multiplication with mixed precision handling for float16"""
    result = torch.matmul(a.float(), b.float())
    return result.to(a.dtype)
 class BatchComposition(Enum):
    UNIFORM = "uniform"
    MIXED = "mixed"
    SKEWED = "skewed"
    NONE = "_NO_LORA_"
 class BatchMode(Enum):
    PREFILL = "prefill"
    DECODE = "decode"
 def reference_sgmv_shrink(
    x: torch.Tensor,
    weights: torch.Tensor,
    batch_info: LoRABatchInfo,
    seq_lengths: List[int],
    lora_assignments: List[str],
    num_slices: int = 1,
 ) -> torch.Tensor:
    """
    Simple sequence-level reference implementation of SGMV shrink operation.
    Args:
        x: (total_seq_len, input_dim) - Input activations
        weights: (num_loras, num_slices * max_rank, input_dim) - LoRA A weights
        batch_info: Batch information (only used for lora_ranks)
        seq_lengths: Length of each sequence
        lora_assignments: LoRA name for each sequence
        num_slices: Number of slices (3 for QKV, 2 for gate_up, 1 for others)
    Returns:
        output: (total_seq_len, num_slices * max_rank) - Intermediate activations
    """
    if weights.numel() == 0:
        total_seq_len = x.shape[0]
        return torch.zeros(total_seq_len, 0, dtype=x.dtype, device=x.device)
    total_seq_len, input_dim = x.shape
    num_loras, weight_out_dim, _ = weights.shape
    max_rank = weight_out_dim // num_slices
    output = torch.zeros(
        total_seq_len, num_slices * max_rank, dtype=x.dtype, device=x.device
    )
    unique_loras = sorted(set(lora_assignments))
    lora_name_to_idx = {name: idx for idx, name in enumerate(unique_loras)}
    lora_ranks = batch_info.lora_ranks.cpu().numpy()
    token_offset = 0
    for seq_len, lora_name in zip(seq_lengths, lora_assignments):
        if seq_len == 0:
            continue
        lora_idx = lora_name_to_idx[lora_name]
        rank = lora_ranks[lora_idx]
        if rank > 0:
            x_seq = x[token_offset : token_offset + seq_len, :]
            w_seq = weights[lora_idx, : num_slices * rank, :]
            result = safe_matmul(x_seq, w_seq.t())
            output[token_offset : token_offset + seq_len, : num_slices * rank] = result
        token_offset += seq_len
    return output
 def reference_sgmv_expand(
    x: torch.Tensor,
    weights: torch.Tensor,
    batch_info: LoRABatchInfo,
    seq_lengths: List[int],
    lora_assignments: List[str],
    slice_offsets: torch.Tensor,
    max_slice_size: int,
    base_output: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
    """
    Simple sequence-level reference implementation of SGMV expand operation.
    Args:
        x: (total_seq_len, num_slices * max_rank) - Intermediate activations
        weights: (num_loras, output_dim, max_rank) - LoRA B weights
        batch_info: Batch information (only used for lora_ranks)
        seq_lengths: Length of each sequence
        lora_assignments: LoRA name for each sequence
        slice_offsets: Tensor defining slice boundaries
        max_slice_size: Maximum slice size for chunking
        base_output: Optional base output to accumulate into
    Returns:
        output: (total_seq_len, total_output_dim) - Final output
    """
    if weights.numel() == 0:
        total_seq_len = x.shape[0]
        total_output_dim = slice_offsets[-1].item() if len(slice_offsets) > 0 else 0
        return torch.zeros(
            total_seq_len, total_output_dim, dtype=x.dtype, device=x.device
        )
    total_seq_len, _ = x.shape
    num_slices = len(slice_offsets) - 1
    if base_output is not None:
        output = base_output.clone()
    else:
        total_output_dim = slice_offsets[-1].item()
        output = torch.zeros(
            total_seq_len, total_output_dim, dtype=x.dtype, device=x.device
        )
    unique_loras = sorted(set(lora_assignments))
    lora_name_to_idx = {name: idx for idx, name in enumerate(unique_loras)}
    lora_ranks = batch_info.lora_ranks.cpu().numpy()
    token_offset = 0
    for seq_len, lora_name in zip(seq_lengths, lora_assignments):
        if seq_len == 0:
            continue
        lora_idx = lora_name_to_idx[lora_name]
        lora_rank = lora_ranks[lora_idx]
        if lora_rank > 0:
            # Extract sequence intermediate activations
            x_seq = x[
                token_offset : token_offset + seq_len, : num_slices * lora_rank
            ]  # (seq_len, num_slices * rank)
            for slice_idx in range(num_slices):
                slice_start_input = slice_idx * lora_rank
                slice_end_input = (slice_idx + 1) * lora_rank
                slice_start_output = slice_offsets[slice_idx].item()
                slice_end_output = slice_offsets[slice_idx + 1].item()
                x_slice = x_seq[:, slice_start_input:slice_end_input]  # (seq_len, rank)
                w_slice = weights[
                    lora_idx, slice_start_output:slice_end_output, :lora_rank
                ]  # (slice_dim, rank)
                result = safe_matmul(x_slice, w_slice.t())  # (seq_len, slice_dim)
                output[
                    token_offset : token_offset + seq_len,
                    slice_start_output:slice_end_output,
                ] += result
        token_offset += seq_len
    return output
 class TestChunkedSGMV(unittest.TestCase):
    # Test configuration constants
    RTOL = 1e-3
    ATOL = 1e-3
    DEFAULT_BATCH_SIZE = 8
    def _compare_shrink_outputs(
        self,
        chunked_output: torch.Tensor,
        reference_output: torch.Tensor,
        seq_lengths: List[int],
        lora_assignments: List[str],
        batch_info: LoRABatchInfo,
        num_slices: int,
        test_name: str,
    ):
        """
        Compare only the valid portions of shrink outputs.
        The chunked SGMV shrink kernel only guarantees correctness for
        output[seq_start:seq_end, :rank * num_slices] for each sequence.
        """
        # Create mapping from LoRA names to indices and ranks
        unique_loras = sorted(set(lora_assignments))
        lora_name_to_idx = {name: idx for idx, name in enumerate(unique_loras)}
        lora_ranks = batch_info.lora_ranks.cpu().numpy()
        token_offset = 0
        for seq_idx, (seq_len, lora_name) in enumerate(
            zip(seq_lengths, lora_assignments)
        ):
            if seq_len == 0:
                continue
            lora_idx = lora_name_to_idx[lora_name]
            rank = lora_ranks[lora_idx]
            if rank > 0:
                # Only compare the valid columns for this sequence
                valid_cols = num_slices * rank
                chunked_seq = chunked_output[
                    token_offset : token_offset + seq_len, :valid_cols
                ]
                reference_seq = reference_output[
                    token_offset : token_offset + seq_len, :valid_cols
                ]
                torch.testing.assert_close(
                    chunked_seq,
                    reference_seq,
                    rtol=self.RTOL,
                    atol=self.ATOL,
                    msg=f"Shrink operation failed for {test_name}, sequence {seq_idx} ({lora_name})",
                )
            token_offset += seq_len
    def setUp(self):
        """Set up common test parameters"""
        torch.manual_seed(42)
        random.seed(42)
        self.device = torch.device("cuda")
        self.dtype = torch.float16
        self.input_dim = 2560  # Hidden dimension
        self.max_seq_len = 1024
        # LoRA configurations: name -> (rank, output_q, output_k, output_v)
        self.lora_configs = {
            "lora_A": (8, 4096, 1024, 1024),
            "lora_B": (16, 4096, 1024, 1024),
            "lora_C": (32, 4096, 1024, 1024),
            "_NO_LORA_": (0, 4096, 1024, 1024),
        }
        # QKV slice offsets: 4096 (Q) + 1024 (K) + 1024 (V) = 6144 total
        self.slice_offsets = torch.tensor(
            [0, 4096, 5120, 6144], dtype=torch.int32, device=self.device
        )
        self.max_slice_size = 4096
    def generate_sequence_lengths(
        self,
        batch_size: int,
        batch_mode: BatchMode = BatchMode.PREFILL,
        min_len: int = 1,
        max_len: int = None,
    ) -> List[int]:
        """Generate sequence lengths for a batch based on mode"""
        if batch_mode == BatchMode.DECODE:
            return [1] * batch_size
        else:
            if max_len is None:
                max_len = self.max_seq_len
            return [random.randint(min_len, max_len) for _ in range(batch_size)]
    def create_lora_weights(
        self, lora_name: str, include_missing_k: bool = False
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Create LoRA A and B weights for given configuration"""
        rank, out_q, out_k, out_v = self.lora_configs[lora_name]
        if rank == 0:
            lora_a = torch.empty(
                0, self.input_dim, dtype=self.dtype, device=self.device
            )
            lora_b = torch.empty(
                out_q + out_k + out_v, 0, dtype=self.dtype, device=self.device
            )
            return lora_a, lora_b
        # Create LoRA A weights (3 slices for QKV)
        lora_a = torch.randn(
            3 * rank, self.input_dim, dtype=self.dtype, device=self.device
        )
        if include_missing_k:
            lora_a[rank : 2 * rank, :] = 0.0
        # Create LoRA B weights (stacked Q, K, V)
        total_output_dim = out_q + out_k + out_v
        lora_b = torch.randn(
            total_output_dim, rank, dtype=self.dtype, device=self.device
        )
        if include_missing_k:
            lora_b[out_q : out_q + out_k, :] = 0.0
        return lora_a, lora_b
    def create_batch_info(
        self,
        seq_lengths: List[int],
        lora_assignments: List[Optional[str]],
        batch_mode: BatchMode = BatchMode.PREFILL,
    ) -> LoRABatchInfo:
        """Create LoRABatchInfo using the same logic as chunked backend"""
        unique_loras = sorted(set(lora_assignments))
        lora_name_to_idx = {name: idx for idx, name in enumerate(unique_loras)}
        seq_weight_indices = [lora_name_to_idx[name] for name in lora_assignments]
        lora_ranks = [self.lora_configs[name][0] for name in unique_loras]
        def create_mock_batch():
            # Create a minimal mock ForwardBatch for the test
            class MockForwardBatch:
                def __init__(self, batch_size, seq_lengths):
                    self.batch_size = batch_size
                    self.extend_seq_lens_cpu = seq_lengths
                    self.forward_mode = MockForwardMode()
            class MockForwardMode:
                def is_extend(self):
                    return batch_mode == BatchMode.PREFILL
            return MockForwardBatch(len(seq_lengths), seq_lengths)
        mock_batch = create_mock_batch()
        # Use the same functions as chunked backend
        permutation, weights_reordered = ChunkedSgmvLoRABackend._get_permutation(
            seq_weight_indices, mock_batch
        )
        # Create a minimal backend instance to access _get_segments_info
        mock_backend = ChunkedSgmvLoRABackend(max_loras_per_batch=8, device=self.device)
        weight_indices_list, seg_indptr = mock_backend._get_segments_info(
            weights_reordered
        )
        scalings = [1.0] * len(unique_loras)
        seg_indptr_tensor = seg_indptr.to(self.device)
        weight_indices_tensor = weight_indices_list.to(self.device)
        lora_ranks_tensor = (
            torch.tensor(lora_ranks, dtype=torch.int32, device=self.device)
            if lora_ranks
            else torch.empty(0, dtype=torch.int32, device=self.device)
        )
        scalings_tensor = (
            torch.tensor(scalings, dtype=torch.float32, device=self.device)
            if scalings
            else torch.empty(0, dtype=torch.float32, device=self.device)
        )
        permutation_tensor = permutation.to(
            self.device, dtype=torch.int32
        )  # Convert to int32 for LoRABatchInfo
        seq_lens_tensor = torch.tensor(
            seq_lengths, dtype=torch.int32, device=self.device
        )
        return LoRABatchInfo(
            use_cuda_graph=False,
            bs=len(seq_lengths),
            num_segments=len(weight_indices_list),  # Number of segments, not sequences!
            seg_indptr=seg_indptr_tensor,
            weight_indices=weight_indices_tensor,
            lora_ranks=lora_ranks_tensor,
            scalings=scalings_tensor,
            seg_lens=seq_lens_tensor,  # Original sequence lengths for reference
            max_len=max(seq_lengths) if seq_lengths else 0,
            permutation=permutation_tensor,  # Token reordering permutation
        )
    def stack_lora_weights(
        self, weight_list: List[torch.Tensor], is_lora_a: bool
    ) -> torch.Tensor:
        """Stack LoRA weights from different adapters into a single tensor"""
        if not weight_list:
            return torch.empty(0, 0, 0, dtype=self.dtype, device=self.device)
        first_non_empty = next((w for w in weight_list if w.numel() > 0), None)
        if first_non_empty is None:
            return torch.empty(
                len(weight_list), 0, 0, dtype=self.dtype, device=self.device
            )
        if is_lora_a:
            # LoRA A: (slice_num * rank, input_dim) -> (num_loras, slice_num * max_rank, input_dim)
            max_rank = max(w.shape[0] // 3 if w.numel() > 0 else 0 for w in weight_list)
            final_shape = (len(weight_list), 3 * max_rank, self.input_dim)
        else:
            # LoRA B: (output_dim, rank) -> (num_loras, output_dim, max_rank)
            max_rank = max(w.shape[1] if w.numel() > 0 else 0 for w in weight_list)
            output_dim = first_non_empty.shape[0]
            final_shape = (len(weight_list), output_dim, max_rank)
        stacked = torch.zeros(final_shape, dtype=self.dtype, device=self.device)
        for i, weight in enumerate(weight_list):
            if weight.numel() > 0:
                if is_lora_a:
                    stacked[i, : weight.shape[0], :] = weight
                else:
                    stacked[i, :, : weight.shape[1]] = weight
        return stacked
    def create_test_batch(
        self,
        batch_composition: BatchComposition,
        batch_size: int,
        batch_mode: BatchMode = BatchMode.PREFILL,
        include_missing_k: bool = False,
    ) -> Tuple[
        torch.Tensor,
        Dict[str, Tuple[torch.Tensor, torch.Tensor]],
        LoRABatchInfo,
        List[int],
        List[str],
    ]:
        """Create test batch with specified composition and mode"""
        seq_lengths = self.generate_sequence_lengths(
            batch_size, batch_mode, 1, self.max_seq_len
        )
        if batch_composition == BatchComposition.UNIFORM:
            lora_assignments = ["lora_A"] * batch_size
        elif batch_composition == BatchComposition.MIXED:
            lora_names = ["lora_A", "lora_B", "lora_C", None]
            lora_assignments = [
                lora_names[i % len(lora_names)] for i in range(batch_size)
            ]
        elif batch_composition == BatchComposition.SKEWED:
            num_minority = max(1, batch_size // 8)
            lora_assignments = ["lora_A"] * num_minority + ["lora_B"] * (
                batch_size - num_minority
            )
            random.shuffle(lora_assignments)
        elif batch_composition == BatchComposition.NONE:
            lora_assignments = [None] * batch_size
        else:
            raise ValueError(f"Unknown batch composition: {batch_composition}")
        total_seq_len = sum(seq_lengths)
        x = torch.randn(
            total_seq_len, self.input_dim, dtype=self.dtype, device=self.device
        )
        normalized_assignments = [
            name if name is not None else "_NO_LORA_" for name in lora_assignments
        ]
        unique_loras = set(normalized_assignments)
        weights = {}
        for lora_name in unique_loras:
            weights[lora_name] = self.create_lora_weights(lora_name, include_missing_k)
        batch_info = self.create_batch_info(
            seq_lengths, normalized_assignments, batch_mode
        )
        return x, weights, batch_info, seq_lengths, normalized_assignments
    def run_test_comparison(
        self,
        x: torch.Tensor,
        weights: Dict[str, Tuple[torch.Tensor, torch.Tensor]],
        batch_info: LoRABatchInfo,
        seq_lengths: List[int],
        lora_assignments: List[str],
        test_name: str,
    ):
        """Run comparison between chunked and reference implementations"""
        if not weights:  # Handle case with no LoRA weights
            return
        # Stack LoRA A weights
        lora_a_weights = [weights[name][0] for name in sorted(weights.keys())]
        stacked_lora_a = self.stack_lora_weights(lora_a_weights, is_lora_a=True)
        # Stack LoRA B weights
        lora_b_weights = [weights[name][1] for name in sorted(weights.keys())]
        stacked_lora_b = self.stack_lora_weights(lora_b_weights, is_lora_a=False)
        # Test shrink operation
        chunked_shrink = chunked_sgmv_lora_shrink_forward(
            x, stacked_lora_a, batch_info, num_slices=3
        )
        reference_shrink = reference_sgmv_shrink(
            x, stacked_lora_a, batch_info, seq_lengths, lora_assignments, num_slices=3
        )
        # Only compare valid portions of shrink output (first rank * num_slices columns per sequence)
        self._compare_shrink_outputs(
            chunked_shrink,
            reference_shrink,
            seq_lengths,
            lora_assignments,
            batch_info,
            num_slices=3,
            test_name=test_name,
        )
        # Test expand operation
        chunked_expand = chunked_sgmv_lora_expand_forward(
            reference_shrink,
            stacked_lora_b,
            batch_info,
            self.slice_offsets,
            self.max_slice_size,
        )
        reference_expand = reference_sgmv_expand(
            reference_shrink,
            stacked_lora_b,
            batch_info,
            seq_lengths,
            lora_assignments,
            self.slice_offsets,
            self.max_slice_size,
        )
        torch.testing.assert_close(
            chunked_expand,
            reference_expand,
            rtol=self.RTOL,
            atol=self.ATOL,
            msg=f"Expand operation failed for {test_name}",
        )
    # === Basic Operations Tests ===
    def test_shrink_basic(self):
        """Test basic shrink operation against PyTorch reference"""
        for batch_size in [1, 2, 16, 64]:
            with self.subTest(batch_size=batch_size):
                x, weights, batch_info, seq_lengths, lora_assignments = (
                    self.create_test_batch(BatchComposition.UNIFORM, batch_size)
                )
                lora_a_weights = [weights[name][0] for name in sorted(weights.keys())]
                stacked_lora_a = self.stack_lora_weights(lora_a_weights, is_lora_a=True)
                chunked_shrink = chunked_sgmv_lora_shrink_forward(
                    x, stacked_lora_a, batch_info, num_slices=3
                )
                reference_shrink = reference_sgmv_shrink(
                    x,
                    stacked_lora_a,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    num_slices=3,
                )
                torch.testing.assert_close(
                    chunked_shrink, reference_shrink, rtol=self.RTOL, atol=self.ATOL
                )
    def test_expand_basic(self):
        """Test basic expand operation against PyTorch reference"""
        for batch_size in [1, 2, 16, 64]:
            with self.subTest(batch_size=batch_size):
                x, weights, batch_info, seq_lengths, lora_assignments = (
                    self.create_test_batch(BatchComposition.UNIFORM, batch_size)
                )
                lora_a_weights = [weights[name][0] for name in sorted(weights.keys())]
                stacked_lora_a = self.stack_lora_weights(lora_a_weights, is_lora_a=True)
                intermediate = reference_sgmv_shrink(
                    x,
                    stacked_lora_a,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    num_slices=3,
                )
                lora_b_weights = [weights[name][1] for name in sorted(weights.keys())]
                stacked_lora_b = self.stack_lora_weights(
                    lora_b_weights, is_lora_a=False
                )
                chunked_expand = chunked_sgmv_lora_expand_forward(
                    intermediate,
                    stacked_lora_b,
                    batch_info,
                    self.slice_offsets,
                    self.max_slice_size,
                )
                reference_expand = reference_sgmv_expand(
                    intermediate,
                    stacked_lora_b,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    self.slice_offsets,
                    self.max_slice_size,
                )
                torch.testing.assert_close(
                    chunked_expand, reference_expand, rtol=self.RTOL, atol=self.ATOL
                )
    # === QKV Operations Test ===
    def test_qkv_missing_projections(self):
        """Test QKV operations with missing k_proj (Qwen3 scenario)"""
        for batch_size in [1, 2, 16, 64]:
            with self.subTest(batch_size=batch_size):
                x, weights, batch_info, seq_lengths, lora_assignments = (
                    self.create_test_batch(
                        BatchComposition.MIXED, batch_size, include_missing_k=True
                    )
                )
                self.run_test_comparison(
                    x,
                    weights,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    f"QKV missing k_proj batch_size={batch_size}",
                )
    # === Batch Composition Tests ===
    def test_uniform_lora_batch(self):
        """All sequences use same LoRA, random sequence lengths"""
        for batch_size in [1, 2, 16, 64]:
            with self.subTest(batch_size=batch_size):
                x, weights, batch_info, seq_lengths, lora_assignments = (
                    self.create_test_batch(BatchComposition.UNIFORM, batch_size)
                )
                self.run_test_comparison(
                    x,
                    weights,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    f"uniform batch_size={batch_size}",
                )
    def test_evenly_mixed_lora_batch(self):
        """Sequences evenly distributed across LoRAs, random lengths"""
        for batch_size in [1, 2, 16, 64]:
            with self.subTest(batch_size=batch_size):
                x, weights, batch_info, seq_lengths, lora_assignments = (
                    self.create_test_batch(BatchComposition.MIXED, batch_size)
                )
                self.run_test_comparison(
                    x,
                    weights,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    f"mixed batch_size={batch_size}",
                )
    def test_highly_skewed_lora_batch(self):
        """Highly uneven LoRA distribution, random lengths"""
        for batch_size in [1, 2, 16, 64]:
            with self.subTest(batch_size=batch_size):
                x, weights, batch_info, seq_lengths, lora_assignments = (
                    self.create_test_batch(BatchComposition.SKEWED, batch_size)
                )
                self.run_test_comparison(
                    x,
                    weights,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    f"skewed batch_size={batch_size}",
                )
    # === Decode Mode Tests ===
    def test_decode_uniform_lora_batch(self):
        """Decode mode: All sequences use same LoRA, all length 1"""
        for batch_size in [1, 2, 16, 64]:
            with self.subTest(batch_size=batch_size):
                x, weights, batch_info, seq_lengths, lora_assignments = (
                    self.create_test_batch(
                        BatchComposition.UNIFORM, batch_size, BatchMode.DECODE
                    )
                )
                self.run_test_comparison(
                    x,
                    weights,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    f"decode uniform batch_size={batch_size}",
                )
    def test_decode_mixed_lora_batch(self):
        """Decode mode: Sequences distributed across LoRAs, all length 1"""
        for batch_size in [1, 2, 16, 64]:
            with self.subTest(batch_size=batch_size):
                x, weights, batch_info, seq_lengths, lora_assignments = (
                    self.create_test_batch(
                        BatchComposition.MIXED, batch_size, BatchMode.DECODE
                    )
                )
                self.run_test_comparison(
                    x,
                    weights,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    f"decode mixed batch_size={batch_size}",
                )
    def test_decode_skewed_lora_batch(self):
        """Decode mode: Highly uneven LoRA distribution, all length 1"""
        for batch_size in [1, 2, 16, 64]:
            with self.subTest(batch_size=batch_size):
                x, weights, batch_info, seq_lengths, lora_assignments = (
                    self.create_test_batch(
                        BatchComposition.SKEWED, batch_size, BatchMode.DECODE
                    )
                )
                self.run_test_comparison(
                    x,
                    weights,
                    batch_info,
                    seq_lengths,
                    lora_assignments,
                    f"decode skewed batch_size={batch_size}",
                )
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -24,6 +24,7 @@ suites = {
        TestFile("lora/test_lora_update.py", 400),
        TestFile("lora/test_lora_qwen3.py", 97),
        TestFile("lora/test_lora_radix_cache.py", 100),
        TestFile("lora/test_chunked_sgmv_backend.py", 30),
        TestFile("models/test_embedding_models.py", 73),
        # TestFile("models/test_clip_models.py", 52),
        TestFile("models/test_encoder_embedding_models.py", 100),