[3/4] Speed up CSGMV backend perf by 10% through dynamic chunking + kernel optimization (#10592)

2025-09-20 22:47:48 -07:00
parent 720c1c8ca3
commit 08ecd0aa2a
10 changed files with 158 additions and 84 deletions
--- a/python/sglang/srt/lora/utils.py
+++ b/python/sglang/srt/lora/utils.py
@@ -19,6 +19,9 @@ class LoRABatchInfo:
    # Number of segments. For triton backend, it is equal to batch size.
    num_segments: int

+    # Maximum segment length of current batch
+    max_len: int
+
    # Index pointers of each segment in shape (num_segments + 1, )
    seg_indptr: torch.Tensor

@@ -34,9 +37,6 @@ class LoRABatchInfo:
    # Lengths of each segment in shape (num_segments,)
    seg_lens: Optional[torch.Tensor]

-    # Maximum segment length of current batch
-    max_len: Optional[int]
-
    # The logical (re)ordering of input rows (tokens), in shape (num_tokens,)
    permutation: Optional[torch.Tensor]