diff --git a/python/pyproject.toml b/python/pyproject.toml
index 7471f84bf..c8a8ffc4a 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -49,7 +49,7 @@ runtime_common = [
 
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.1.8.post1",
+    "sgl-kernel==0.1.8.post2",
     "flashinfer_python==0.2.6.post1",
     "torch==2.7.1",
     "torchaudio==2.7.1",
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index 357146469..e53ad1a3b 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -605,7 +605,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.8.post1",
+            "0.1.8.post2",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
diff --git a/python/sglang/srt/layers/attention/cutlass_mla_backend.py b/python/sglang/srt/layers/attention/cutlass_mla_backend.py
index 416eff724..8b3d18602 100644
--- a/python/sglang/srt/layers/attention/cutlass_mla_backend.py
+++ b/python/sglang/srt/layers/attention/cutlass_mla_backend.py
@@ -280,6 +280,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
             seq_lens=forward_batch.seq_lens.to(torch.int32),
             page_table=self.forward_metadata.block_kv_indices,
             workspace=self.forward_metadata.workspace,
+            sm_scale=layer.scaling,
             num_kv_splits=1,
         )
 
diff --git a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py
index e407a3598..582fcc9b4 100644
--- a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py
+++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py
@@ -52,7 +52,11 @@ def grouped_gemm_nt_f8f8bf16_masked(
         expected_m, n, k, num_groups, kernel_type
     ):
         _grouped_gemm_nt_f8f8bf16_masked_raw(
-            lhs, rhs, out, masked_m, expected_m,
+            lhs,
+            rhs,
+            out,
+            masked_m,
+            expected_m,
             **({"recipe": recipe} if DEEPGEMM_V202506 else {})
         )
 