[4/4] Introduce CachedKernel to reduce CSGMV kernel launch overheads by 60% (#10709)

2025-09-21 22:26:42 -07:00
parent 1c3dbad8fe
commit 635ccda673
3 changed files with 118 additions and 0 deletions
--- a/python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py
+++ b/python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py
@@ -5,8 +5,10 @@ import triton
 import triton.language as tl

 from sglang.srt.lora.utils import LoRABatchInfo
+from sglang.utils import cached_triton_kernel


+@cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
 @triton.jit
 def _chunked_lora_expand_kernel(
     # Pointers to matrices
--- a/python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py
+++ b/python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py
@@ -3,8 +3,10 @@ import triton
 import triton.language as tl

 from sglang.srt.lora.utils import LoRABatchInfo
+from sglang.utils import cached_triton_kernel


+@cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
 @triton.jit
 def _chunked_lora_shrink_kernel(
     # Pointers to matrices