# sglang/sgl-kernel/python/sgl_kernel/__init__.py

import ctypes
import os

import torch

# Preload the CUDA 12 runtime with RTLD_GLOBAL before the compiled extension
# modules below are imported. This was introduced to fix an
# "undefined symbol cudaGetDriverEntryPointByVersion" error (#3372).
# The load is skipped when the library is not present at this fixed location.
_CUDART_SO_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.12"
if os.path.exists(_CUDART_SO_PATH):
    ctypes.CDLL(_CUDART_SO_PATH, mode=ctypes.RTLD_GLOBAL)

from sgl_kernel import common_ops
from sgl_kernel.allreduce import *
from sgl_kernel.attention import lightning_attention_decode
from sgl_kernel.elementwise import (
    apply_rope_with_cos_sin_cache_inplace,
    fused_add_rmsnorm,
    gelu_and_mul,
    gelu_tanh_and_mul,
    gemma_fused_add_rmsnorm,
    gemma_rmsnorm,
    rmsnorm,
    silu_and_mul,
)
from sgl_kernel.gemm import (
    awq_dequantize,
    bmm_fp8,
    cublas_grouped_gemm,
    cutlass_scaled_fp4_mm,
    fp8_blockwise_scaled_mm,
    fp8_scaled_mm,
    int8_scaled_mm,
    scaled_fp4_quant,
    sgl_per_tensor_quant_fp8,
    sgl_per_token_group_quant_fp8,
    sgl_per_token_group_quant_int8,
    sgl_per_token_quant_fp8,
)
from sgl_kernel.moe import moe_align_block_size, moe_fused_gate, topk_softmax
from sgl_kernel.sampling import (
    min_p_sampling_from_probs,
    top_k_renorm_prob,
    top_k_top_p_sampling_from_probs,
    top_p_renorm_prob,
    top_p_sampling_from_probs,
)
from sgl_kernel.speculative import (
    build_tree_kernel_efficient,
    segment_packbits,
    tree_speculative_sampling_target_only,
    verify_tree_greedy,
)
from sgl_kernel.version import __version__

# Placeholder binding: the name is exported but holds no kernel.
# Presumably kept so existing sglang code referencing it keeps importing —
# confirm against callers.
# TODO(ying): remove this after updating the sglang python code.
build_tree_kernel = None
fix undefined symbol cudaGetDriverEntryPointByVersion (#3372) 2025-02-07 19:32:45 +08:00			`import ctypes`
			`import os`

Enable custom AR for AMD GPUs and maintain it in sgl-kernel (#3406) 2025-03-02 15:19:06 -08:00			`import torch`

fix undefined symbol cudaGetDriverEntryPointByVersion (#3372) 2025-02-07 19:32:45 +08:00			`if os.path.exists("/usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.12"):`
			`ctypes.CDLL(`
			`"/usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.12",`
			`mode=ctypes.RTLD_GLOBAL,`
			`)`

Rename files in sgl kernel to avoid nested folder structure (#4213) Co-authored-by: zhyncs <me@zhyncs.com> 2025-03-08 22:54:51 -08:00			`from sgl_kernel import common_ops`
			`from sgl_kernel.allreduce import *`
			`from sgl_kernel.attention import lightning_attention_decode`
			`from sgl_kernel.elementwise import (`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`apply_rope_with_cos_sin_cache_inplace,`
			`fused_add_rmsnorm,`
			`gelu_and_mul,`
			`gelu_tanh_and_mul,`
			`gemma_fused_add_rmsnorm,`
			`gemma_rmsnorm,`
			`rmsnorm,`
			`silu_and_mul,`
			`)`
Rename files in sgl kernel to avoid nested folder structure (#4213) Co-authored-by: zhyncs <me@zhyncs.com> 2025-03-08 22:54:51 -08:00			`from sgl_kernel.gemm import (`
Add awq dequantize kernel to sgl with 1x to 3x speedup (#4104) 2025-03-12 00:10:02 -07:00			`awq_dequantize,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`bmm_fp8,`
			`cublas_grouped_gemm,`
Support FP4 gemm (1/2) (#3899) 2025-03-24 19:50:23 -07:00			`cutlass_scaled_fp4_mm,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`fp8_blockwise_scaled_mm,`
			`fp8_scaled_mm,`
			`int8_scaled_mm,`
Support FP4 gemm (1/2) (#3899) 2025-03-24 19:50:23 -07:00			`scaled_fp4_quant,`
[quant kernel] sgl-kernel support per_tensor_quant fp8 (#3786) 2025-03-07 10:05:43 +08:00			`sgl_per_tensor_quant_fp8,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`sgl_per_token_group_quant_fp8,`
[Quant Kernel] refactored per token group quant fp8 to support int8 up-to 2x faster (#4396) 2025-03-23 23:44:17 -07:00			`sgl_per_token_group_quant_int8,`
Add sgl_per_token_quant_fp8 (#4089) 2025-03-06 20:53:05 -08:00			`sgl_per_token_quant_fp8,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`)`
Add deepseek style fused moe group gate selection kernel (#4530) 2025-03-29 11:51:45 -07:00			`from sgl_kernel.moe import moe_align_block_size, moe_fused_gate, topk_softmax`
Rename files in sgl kernel to avoid nested folder structure (#4213) Co-authored-by: zhyncs <me@zhyncs.com> 2025-03-08 22:54:51 -08:00			`from sgl_kernel.sampling import (`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`min_p_sampling_from_probs,`
			`top_k_renorm_prob,`
			`top_k_top_p_sampling_from_probs,`
			`top_p_renorm_prob,`
			`top_p_sampling_from_probs,`
			`)`
Rename files in sgl kernel to avoid nested folder structure (#4213) Co-authored-by: zhyncs <me@zhyncs.com> 2025-03-08 22:54:51 -08:00			`from sgl_kernel.speculative import (`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`build_tree_kernel_efficient,`
Add greedy verification kernel (#4383) 2025-03-16 00:58:26 -07:00			`segment_packbits,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`tree_speculative_sampling_target_only,`
Add greedy verification kernel (#4383) 2025-03-16 00:58:26 -07:00			`verify_tree_greedy,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`)`
Improve code styles (#4021) 2025-03-03 03:20:23 -08:00			`from sgl_kernel.version import __version__`
Add greedy verification kernel (#4383) 2025-03-16 00:58:26 -07:00
			`build_tree_kernel = (`
			`None # TODO(ying): remove this after updating the sglang python code.`
			`)`