[4/4] Introduce CachedKernel to reduce CSGMV kernel launch overheads by 60% (#10709)

This commit is contained in:
Lifu Huang
2025-09-21 22:26:42 -07:00
committed by GitHub
parent 1c3dbad8fe
commit 635ccda673
3 changed files with 118 additions and 0 deletions

View File

@@ -5,8 +5,10 @@ import triton
import triton.language as tl
from sglang.srt.lora.utils import LoRABatchInfo
from sglang.utils import cached_triton_kernel
@cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
@triton.jit
def _chunked_lora_expand_kernel(
# Pointers to matrices

View File

@@ -3,8 +3,10 @@ import triton
import triton.language as tl
from sglang.srt.lora.utils import LoRABatchInfo
from sglang.utils import cached_triton_kernel
@cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
@triton.jit
def _chunked_lora_shrink_kernel(
# Pointers to matrices