sglang/sgl-kernel/python/sgl_kernel/__init__.py

import ctypes
import os
import platform
import shutil
from pathlib import Path

import torch


# copy & modify from torch/utils/cpp_extension.py
def _find_cuda_home():
    """Find the CUDA install path.

    Candidates are tried in order:

    1. The ``CUDA_HOME`` environment variable, then ``CUDA_PATH``.
    2. The grandparent directory of the ``nvcc`` executable found on
       ``PATH`` (i.e. ``<cuda_home>/bin/nvcc``).
    3. The hard-coded default ``/usr/local/cuda``.

    An environment variable that is set but empty is treated as unset, so an
    empty string is never returned as the install path.

    Returns:
        The CUDA install path as a string. The path is not checked for
        existence.
    """
    # Guess #1: explicit environment overrides; empty values do not count.
    cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
    if cuda_home:
        return cuda_home

    # Guess #2: derive the root from the location of nvcc (<root>/bin/nvcc).
    nvcc_path = shutil.which("nvcc")
    if nvcc_path is not None:
        return os.path.dirname(os.path.dirname(nvcc_path))

    # Guess #3: conventional default location.
    return "/usr/local/cuda"


# On CUDA builds of torch, locate libcudart.so.12 under the CUDA install and
# load it with RTLD_GLOBAL before the sgl_kernel extension modules below are
# imported.
if torch.version.cuda is not None:
    cuda_home = Path(_find_cuda_home())

    # Prefer the conventional library directories directly under the CUDA
    # root: "lib" first, then "lib64". Only directory existence is checked.
    cuda_path = next(
        (
            cuda_home / dir_name
            for dir_name in ("lib", "lib64")
            if (cuda_home / dir_name).is_dir()
        ),
        None,
    )
    if cuda_path is None:
        # Neither directory exists: search for 'libcudart.so.12' in
        # subdirectories and use the directory of the first match.
        cuda_path = next(
            (match.parent for match in cuda_home.rglob("libcudart.so.12")),
            None,
        )
        if cuda_path is None:
            raise RuntimeError("Could not find CUDA lib directory.")

    # The preload is best-effort: when the chosen directory does not contain
    # the runtime library, nothing is loaded and no error is raised.
    cuda_include = cuda_path.joinpath("libcudart.so.12").resolve()
    if cuda_include.exists():
        ctypes.CDLL(str(cuda_include), mode=ctypes.RTLD_GLOBAL)

from sgl_kernel import common_ops
from sgl_kernel.allreduce import *
from sgl_kernel.attention import (
    cutlass_mla_decode,
    cutlass_mla_get_workspace_size,
    lightning_attention_decode,
    merge_state,
    merge_state_v2,
)
from sgl_kernel.cutlass_moe import cutlass_w4a8_moe_mm, get_cutlass_w4a8_moe_mm_data
from sgl_kernel.elementwise import (
    FusedSetKVBufferArg,
    apply_rope_with_cos_sin_cache_inplace,
    concat_mla_absorb_q,
    concat_mla_k,
    copy_to_gpu_no_ce,
    downcast_fp8,
    fused_add_rmsnorm,
    gelu_and_mul,
    gelu_tanh_and_mul,
    gemma_fused_add_rmsnorm,
    gemma_rmsnorm,
    rmsnorm,
    silu_and_mul,
)
from sgl_kernel.fused_moe import fused_marlin_moe
from sgl_kernel.gemm import (
    awq_dequantize,
    bmm_fp8,
    cutlass_scaled_fp4_mm,
    dsv3_fused_a_gemm,
    dsv3_router_gemm,
    fp8_blockwise_scaled_mm,
    fp8_scaled_mm,
    gptq_gemm,
    gptq_marlin_gemm,
    gptq_shuffle,
    int8_scaled_mm,
    qserve_w4a8_per_chn_gemm,
    qserve_w4a8_per_group_gemm,
    scaled_fp4_experts_quant,
    scaled_fp4_grouped_quant,
    scaled_fp4_quant,
    sgl_per_tensor_quant_fp8,
    sgl_per_token_group_quant_fp8,
    sgl_per_token_group_quant_int8,
    sgl_per_token_quant_fp8,
    shuffle_rows,
    silu_and_mul_scaled_fp4_grouped_quant,
)
from sgl_kernel.grammar import apply_token_bitmask_inplace_cuda
from sgl_kernel.kvcacheio import (
    transfer_kv_all_layer,
    transfer_kv_all_layer_mla,
    transfer_kv_per_layer,
    transfer_kv_per_layer_mla,
)
from sgl_kernel.mamba import causal_conv1d_fwd, causal_conv1d_update
from sgl_kernel.marlin import (
    awq_marlin_moe_repack,
    awq_marlin_repack,
    gptq_marlin_repack,
)
from sgl_kernel.memory import set_kv_buffer_kernel
from sgl_kernel.moe import (
    apply_shuffle_mul_sum,
    cutlass_fp4_group_mm,
    fp8_blockwise_scaled_grouped_mm,
    moe_align_block_size,
    moe_fused_gate,
    prepare_moe_input,
    topk_softmax,
)
from sgl_kernel.sampling import (
    min_p_sampling_from_probs,
    top_k_mask_logits,
    top_k_renorm_prob,
    top_k_top_p_sampling_from_logits,
    top_k_top_p_sampling_from_probs,
    top_p_renorm_prob,
    top_p_sampling_from_probs,
)
from sgl_kernel.speculative import (
    build_tree_kernel_efficient,
    segment_packbits,
    tree_speculative_sampling_target_only,
    verify_tree_greedy,
)
from sgl_kernel.top_k import fast_topk
from sgl_kernel.version import __version__

# gelu_quick is imported only when torch reports a HIP version
# (torch.version.hip is set); on other builds the name is not exposed here.
if torch.version.hip is not None:
    from sgl_kernel.elementwise import gelu_quick


def create_greenctx_stream_by_value(*args, **kwargs):
    """Forward to ``sgl_kernel.spatial.create_greenctx_stream_by_value``.

    The import happens at call time rather than at package import, so
    ``sgl_kernel.spatial`` is only loaded when this wrapper is invoked. All
    positional and keyword arguments are passed through unchanged and the
    result is returned as-is.
    """
    # Deferred import: an import failure surfaces here, on first use.
    from sgl_kernel.spatial import create_greenctx_stream_by_value as _target

    result = _target(*args, **kwargs)
    return result


def get_sm_available(*args, **kwargs):
    """Forward to ``sgl_kernel.spatial.get_sm_available``.

    The import happens at call time rather than at package import, so
    ``sgl_kernel.spatial`` is only loaded when this wrapper is invoked. All
    positional and keyword arguments are passed through unchanged and the
    result is returned as-is.
    """
    # Deferred import: an import failure surfaces here, on first use.
    from sgl_kernel.spatial import get_sm_available as _target

    result = _target(*args, **kwargs)
    return result
fix undefined symbol cudaGetDriverEntryPointByVersion (#3372) 2025-02-07 19:32:45 +08:00			`import ctypes`
			`import os`
[FEATURE] Enhance platform compatibility for ARM (#5746) 2025-04-30 00:06:16 +02:00			`import platform`
CUDA Arch Independent (#8813) 2025-09-17 08:01:45 +02:00			`import shutil`
			`from pathlib import Path`
fix undefined symbol cudaGetDriverEntryPointByVersion (#3372) 2025-02-07 19:32:45 +08:00
Enable custom AR for AMD GPUs and maintain it in sgl-kernel (#3406) 2025-03-02 15:19:06 -08:00			`import torch`

[FEATURE] Enhance platform compatibility for ARM (#5746) 2025-04-30 00:06:16 +02:00
CUDA Arch Independent (#8813) 2025-09-17 08:01:45 +02:00			`# copy & modify from torch/utils/cpp_extension.py`
			`def _find_cuda_home():`
			`"""Find the CUDA install path."""`
			`# Guess #1`
			`cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")`
			`if cuda_home is None:`
			`# Guess #2`
			`nvcc_path = shutil.which("nvcc")`
			`if nvcc_path is not None:`
			`cuda_home = os.path.dirname(os.path.dirname(nvcc_path))`
			`else:`
			`# Guess #3`
			`cuda_home = "/usr/local/cuda"`
			`return cuda_home`


Fix sgl_kernel import failure on devices other than CUDA (#10610) 2025-09-19 02:38:02 +08:00			`if torch.version.cuda is not None:`
CUDA Arch Independent (#8813) 2025-09-17 08:01:45 +02:00			`cuda_home = Path(_find_cuda_home())`

			`if (cuda_home / "lib").is_dir():`
			`cuda_path = cuda_home / "lib"`
			`elif (cuda_home / "lib64").is_dir():`
			`cuda_path = cuda_home / "lib64"`
			`else:`
			`# Search for 'libcudart.so.12' in subdirectories`
			`for path in cuda_home.rglob("libcudart.so.12"):`
			`cuda_path = path.parent`
			`break`
			`else:`
			`raise RuntimeError("Could not find CUDA lib directory.")`

			`cuda_include = (cuda_path / "libcudart.so.12").resolve()`
			`if cuda_include.exists():`
			`ctypes.CDLL(str(cuda_include), mode=ctypes.RTLD_GLOBAL)`
fix undefined symbol cudaGetDriverEntryPointByVersion (#3372) 2025-02-07 19:32:45 +08:00
Rename files in sgl kernel to avoid nested folder structure (#4213) Co-authored-by: zhyncs <me@zhyncs.com> 2025-03-08 22:54:51 -08:00			`from sgl_kernel import common_ops`
			`from sgl_kernel.allreduce import *`
Blackwell Cutlass MLA kernel (#5142) 2025-04-11 22:16:51 -07:00			`from sgl_kernel.attention import (`
			`cutlass_mla_decode,`
			`cutlass_mla_get_workspace_size,`
			`lightning_attention_decode,`
feat: adapt merge_state (#5337) 2025-04-12 21:14:04 -07:00			`merge_state,`
kernel: support slightly faster merge_state_v2 cuda kernel (#5381) 2025-04-15 12:28:23 +08:00			`merge_state_v2,`
Blackwell Cutlass MLA kernel (#5142) 2025-04-11 22:16:51 -07:00			`)`
[1/n]: add cutlass W4A8 moe kernel for hopper architecture (#7772) Signed-off-by: yangsijia.614 <yangsijia.614@bytedance.com> Co-authored-by: yicwang <yichen.wang@bytedance.com> 2025-07-05 11:50:12 +08:00			`from sgl_kernel.cutlass_moe import cutlass_w4a8_moe_mm, get_cutlass_w4a8_moe_mm_data`
Rename files in sgl kernel to avoid nested folder structure (#4213) Co-authored-by: zhyncs <me@zhyncs.com> 2025-03-08 22:54:51 -08:00			`from sgl_kernel.elementwise import (`
Fuse writing KV buffer into rope kernel (part 1: sgl-kernel) (#9077) 2025-08-12 16:46:40 +08:00			`FusedSetKVBufferArg,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`apply_rope_with_cos_sin_cache_inplace,`
[1/2] Speed up trtllm_mla attention backend (>10% e2e) (#10473) 2025-09-16 02:53:21 +08:00			`concat_mla_absorb_q,`
[1/2] Speed up prefill mla attention (#10156) 2025-09-09 00:00:33 +08:00			`concat_mla_k,`
Support copying tensor from cpu to gpu without using copy engines (#10007) 2025-09-05 20:07:19 +08:00			`copy_to_gpu_no_ce,`
[Minor] Fix the style of sgl-kernel (#9332) 2025-08-18 23:45:00 -07:00			`downcast_fp8,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`fused_add_rmsnorm,`
			`gelu_and_mul,`
			`gelu_tanh_and_mul,`
			`gemma_fused_add_rmsnorm,`
			`gemma_rmsnorm,`
			`rmsnorm,`
			`silu_and_mul,`
			`)`
Minor style fixes for sgl-kernel (#9289) 2025-08-18 09:38:35 -07:00			`from sgl_kernel.fused_moe import fused_marlin_moe`
Rename files in sgl kernel to avoid nested folder structure (#4213) Co-authored-by: zhyncs <me@zhyncs.com> 2025-03-08 22:54:51 -08:00			`from sgl_kernel.gemm import (`
Add awq dequantize kernel to sgl with 1x to 3x speedup (#4104) 2025-03-12 00:10:02 -07:00			`awq_dequantize,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`bmm_fp8,`
Support FP4 gemm (1/2) (#3899) 2025-03-24 19:50:23 -07:00			`cutlass_scaled_fp4_mm,`
Add dsv3 fused a gemm to sgl-kernel (#7630) 2025-06-29 17:52:24 +08:00			`dsv3_fused_a_gemm,`
Add dsv3 router gemm kernel (#7627) 2025-06-29 23:31:55 -07:00			`dsv3_router_gemm,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`fp8_blockwise_scaled_mm,`
			`fp8_scaled_mm,`
[2/n]decouple quantization implementation from vLLM dependency (#8112) Co-authored-by: walker-ai <yiyun.wyt@antgroup.com> Co-authored-by: leoneo <1320612015@qq.com> 2025-08-14 18:19:03 +08:00			`gptq_gemm,`
			`gptq_marlin_gemm,`
			`gptq_shuffle,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`int8_scaled_mm,`
[1/2] Support Qserve (#6457) Co-authored-by: yych0745 <1398089567@qq.com> Co-authored-by: sleepcoo <sleepcoo@gmail.com> 2025-05-22 10:48:59 +08:00			`qserve_w4a8_per_chn_gemm,`
			`qserve_w4a8_per_group_gemm,`
[1/2] Add Kernel support for Cutlass based Fused FP4 MoE (#6093) Signed-off-by: Pavani Majety <pmajety@nvidia.com> 2025-06-02 13:48:03 -07:00			`scaled_fp4_experts_quant,`
[NVIDA] [1/N] Nvfp4 Masked Gemm: Add quant op for the flashinfer grouped gemm (#9200) 2025-08-22 12:19:45 -07:00			`scaled_fp4_grouped_quant,`
Support FP4 gemm (1/2) (#3899) 2025-03-24 19:50:23 -07:00			`scaled_fp4_quant,`
[quant kernel] sgl-kernel support per_tensor_quant fp8 (#3786) 2025-03-07 10:05:43 +08:00			`sgl_per_tensor_quant_fp8,`
Revert "[1/2] Optimizations and refactors about quant kernel (#9534)" (#10292) 2025-09-10 18:24:23 -07:00			`sgl_per_token_group_quant_fp8,`
			`sgl_per_token_group_quant_int8,`
Add sgl_per_token_quant_fp8 (#4089) 2025-03-06 20:53:05 -08:00			`sgl_per_token_quant_fp8,`
[1/2] Add Kernel support for Cutlass based Fused FP4 MoE (#6093) Signed-off-by: Pavani Majety <pmajety@nvidia.com> 2025-06-02 13:48:03 -07:00			`shuffle_rows,`
[NVIDA] [1/N] Nvfp4 Masked Gemm: Add quant op for the flashinfer grouped gemm (#9200) 2025-08-22 12:19:45 -07:00			`silu_and_mul_scaled_fp4_grouped_quant,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`)`
fix sgl-kernel unit tests (#5666) 2025-04-23 01:18:30 -07:00			`from sgl_kernel.grammar import apply_token_bitmask_inplace_cuda`
kvcache io kernels and test case (#7382) 2025-06-23 11:58:59 -07:00			`from sgl_kernel.kvcacheio import (`
			`transfer_kv_all_layer,`
			`transfer_kv_all_layer_mla,`
			`transfer_kv_per_layer,`
			`transfer_kv_per_layer_mla,`
			`)`
Fix the style of sgl kernel (#10398) 2025-09-12 22:20:21 -07:00			`from sgl_kernel.mamba import causal_conv1d_fwd, causal_conv1d_update`
[1/n] apply wna16marlin kernel in moe weight only quantization (#7683) Co-authored-by: 晟海 <huangtingwei.htw@antgroup.com> Co-authored-by: yych0745 <1398089567@qq.com> Co-authored-by: HandH1998 <1335248067@qq.com> Co-authored-by: 弋云 <yiyun.wyt@antgroup.com> Co-authored-by: walker-ai <2398833647@qq.com> 2025-07-02 14:21:25 +08:00			`from sgl_kernel.marlin import (`
			`awq_marlin_moe_repack,`
			`awq_marlin_repack,`
			`gptq_marlin_repack,`
			`)`
[Feature] Support custom set kv buffer kernel (#8884) 2025-08-12 16:56:51 -07:00			`from sgl_kernel.memory import set_kv_buffer_kernel`
[1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (#5281) 2025-04-22 22:28:20 -07:00			`from sgl_kernel.moe import (`
Add a CUDA kernel for fusing mapping and weighted sum for MoE. (#6916) Co-authored-by: Elfie Guo <elfiegxf@gmail.com> 2025-06-07 15:24:39 -07:00			`apply_shuffle_mul_sum,`
[1/2] Add Kernel support for Cutlass based Fused FP4 MoE (#6093) Signed-off-by: Pavani Majety <pmajety@nvidia.com> 2025-06-02 13:48:03 -07:00			`cutlass_fp4_group_mm,`
[1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (#5281) 2025-04-22 22:28:20 -07:00			`fp8_blockwise_scaled_grouped_mm,`
			`moe_align_block_size,`
			`moe_fused_gate,`
[2/2] Add python wrapper for CUTLASS FP8 Blockscale MoE Kernel. (#5694) 2025-05-16 13:14:07 -07:00			`prepare_moe_input,`
[1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (#5281) 2025-04-22 22:28:20 -07:00			`topk_softmax,`
			`)`
Rename files in sgl kernel to avoid nested folder structure (#4213) Co-authored-by: zhyncs <me@zhyncs.com> 2025-03-08 22:54:51 -08:00			`from sgl_kernel.sampling import (`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`min_p_sampling_from_probs,`
[sgl-kernel] Support FlashInfer top_k_top_p_sampling_from_logits (#9060) Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com> 2025-08-15 01:56:36 +08:00			`top_k_mask_logits,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`top_k_renorm_prob,`
[sgl-kernel] Support FlashInfer top_k_top_p_sampling_from_logits (#9060) Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com> 2025-08-15 01:56:36 +08:00			`top_k_top_p_sampling_from_logits,`
Reorganize python source files in sgl-kernel with multiple files (#4027) 2025-03-03 06:36:40 -08:00			`top_k_top_p_sampling_from_probs,`
			`top_p_renorm_prob,`
			`top_p_sampling_from_probs,`
			`)`
[Minor] Fix the style of sgl-kernel (#9332) 2025-08-18 23:45:00 -07:00			`from sgl_kernel.speculative import (`
			`build_tree_kernel_efficient,`
			`segment_packbits,`
			`tree_speculative_sampling_target_only,`
			`verify_tree_greedy,`
			`)`
			`from sgl_kernel.top_k import fast_topk`
			`from sgl_kernel.version import __version__`
Optional extension for green context (#9231) 2025-08-15 21:33:52 +08:00
Fix the style of sgl kernel (#10398) 2025-09-12 22:20:21 -07:00			`if torch.version.hip is not None:`
			`from sgl_kernel.elementwise import gelu_quick`

Optional extension for green context (#9231) 2025-08-15 21:33:52 +08:00
			`def create_greenctx_stream_by_value(*args, **kwargs):`
			`from sgl_kernel.spatial import create_greenctx_stream_by_value as _impl`

			`return _impl(*args, **kwargs)`


			`def get_sm_available(*args, **kwargs):`
			`from sgl_kernel.spatial import get_sm_available as _impl`

			`return _impl(*args, **kwargs)`