diff --git a/docs/developer_guide/contribution_guide.md b/docs/developer_guide/contribution_guide.md
index 337ff77d2..e2171f447 100644
--- a/docs/developer_guide/contribution_guide.md
+++ b/docs/developer_guide/contribution_guide.md
@@ -76,11 +76,12 @@ If you modify files protected by code owners, their approval is required to merg
 - Try to make functions as pure as possible. Avoid in-place modification of arguments.
 
 ## How to update sgl-kernel
-Since sglang and sgl-kernel are separate Python packages, our current GitHub CI infrastructure does not support updating a kernel and using it immediately within the same pull request (PR). To add a new kernel or modify an existing one in the sgl-kernel package, you must use multiple PRs.
+Since sglang and sgl-kernel are separate Python packages, our current GitHub CI infrastructure does not support updating a kernel and using it immediately within the same pull request (PR).
+To add a new kernel or modify an existing one in the sgl-kernel package, you must use multiple PRs.
 
 Follow these steps:
 
-1. Submit a PR to update the sgl-kernel source code without using it (e.g., [#8884](https://github.com/sgl-project/sglang/pull/8884/files)).
+1. Submit a PR to update the sgl-kernel source code without using it in the sglang Python package (e.g., [#8884](https://github.com/sgl-project/sglang/pull/8884/files)).
 2. Bump the version of sgl-kernel (e.g., [#9220](https://github.com/sgl-project/sglang/pull/9220/files)).
    - Once merged, this will trigger an automatic release of the sgl-kernel wheel to PyPI.
    - If not urgent, you can wait for other people to release the wheel. A new version will typically be released within one week.
diff --git a/sgl-kernel/python/sgl_kernel/__init__.py b/sgl-kernel/python/sgl_kernel/__init__.py
index 515aa4adf..6480a097d 100755
--- a/sgl-kernel/python/sgl_kernel/__init__.py
+++ b/sgl-kernel/python/sgl_kernel/__init__.py
@@ -23,6 +23,7 @@ from sgl_kernel.cutlass_moe import cutlass_w4a8_moe_mm, get_cutlass_w4a8_moe_mm_
 from sgl_kernel.elementwise import (
     FusedSetKVBufferArg,
     apply_rope_with_cos_sin_cache_inplace,
+    downcast_fp8,
     fused_add_rmsnorm,
     gelu_and_mul,
     gelu_tanh_and_mul,
@@ -92,6 +93,14 @@ from sgl_kernel.sampling import (
     top_p_renorm_prob,
     top_p_sampling_from_probs,
 )
+from sgl_kernel.speculative import (
+    build_tree_kernel_efficient,
+    segment_packbits,
+    tree_speculative_sampling_target_only,
+    verify_tree_greedy,
+)
+from sgl_kernel.top_k import fast_topk
+from sgl_kernel.version import __version__
 
 
 def create_greenctx_stream_by_value(*args, **kwargs):
@@ -104,13 +113,3 @@ def get_sm_available(*args, **kwargs):
     from sgl_kernel.spatial import get_sm_available as _impl
 
     return _impl(*args, **kwargs)
-
-
-from sgl_kernel.speculative import (
-    build_tree_kernel_efficient,
-    segment_packbits,
-    tree_speculative_sampling_target_only,
-    verify_tree_greedy,
-)
-from sgl_kernel.top_k import fast_topk
-from sgl_kernel.version import __version__
diff --git a/sgl-kernel/setup_rocm.py b/sgl-kernel/setup_rocm.py
index 02c2019ff..2105c7c1f 100644
--- a/sgl-kernel/setup_rocm.py
+++ b/sgl-kernel/setup_rocm.py
@@ -43,12 +43,12 @@ include_dirs = [
 sources = [
     "csrc/allreduce/custom_all_reduce.hip",
     "csrc/allreduce/quick_all_reduce.cu",
+    "csrc/common_extension_rocm.cc",
     "csrc/elementwise/activation.cu",
+    "csrc/grammar/apply_token_bitmask_inplace_cuda.cu",
     "csrc/moe/moe_align_kernel.cu",
     "csrc/moe/moe_topk_softmax_kernels.cu",
     "csrc/speculative/eagle_utils.cu",
-    "csrc/common_extension_rocm.cc",
-    "csrc/grammar/apply_token_bitmask_inplace_cuda.cu",
 ]
 
 cxx_flags = ["-O3"]