[4/4] Introduce CachedKernel to reduce CSGMV kernel launch overheads by 60% (#10709)
This commit is contained in:
@@ -5,8 +5,10 @@ import triton
|
||||
import triton.language as tl
|
||||
|
||||
from sglang.srt.lora.utils import LoRABatchInfo
|
||||
from sglang.utils import cached_triton_kernel
|
||||
|
||||
|
||||
@cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
|
||||
@triton.jit
|
||||
def _chunked_lora_expand_kernel(
|
||||
# Pointers to matrices
|
||||
|
||||
@@ -3,8 +3,10 @@ import triton
|
||||
import triton.language as tl
|
||||
|
||||
from sglang.srt.lora.utils import LoRABatchInfo
|
||||
from sglang.utils import cached_triton_kernel
|
||||
|
||||
|
||||
@cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
|
||||
@triton.jit
|
||||
def _chunked_lora_shrink_kernel(
|
||||
# Pointers to matrices
|
||||
|
||||
Reference in New Issue
Block a user