[3/4] Speed up CSGMV backend perf by 10% through dynamic chunking + kernel optimization (#10592)

This commit is contained in:
Lifu Huang
2025-09-20 22:47:48 -07:00
committed by GitHub
parent 720c1c8ca3
commit 08ecd0aa2a
10 changed files with 158 additions and 84 deletions

View File

@@ -19,6 +19,9 @@ class LoRABatchInfo:
# Number of segments. For the triton backend, it is equal to the batch size.
num_segments: int
# Maximum segment length of current batch
max_len: int
# Index pointers of each segment in shape (num_segments + 1,)
seg_indptr: torch.Tensor
@@ -34,9 +37,6 @@ class LoRABatchInfo:
# Lengths of each segment in shape (num_segments,)
seg_lens: Optional[torch.Tensor]
# Maximum segment length of current batch
max_len: Optional[int]
# The logical (re)ordering of input rows (tokens), in shape (num_tokens,)
permutation: Optional[torch.Tensor]