diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 265e1374b..3b33b319d 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -266,7 +266,7 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
-          # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
 
       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 10
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 6078abdd2..3336823b3 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -44,7 +44,7 @@ runtime_common = [
 
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.4",
+    "sgl-kernel==0.0.4.post1",
     "flashinfer_python==0.2.2.post1",
     "torch==2.5.1",
     "vllm>=0.6.4.post1,<=0.7.2",
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 5aafcc270..6c52709f7 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -480,7 +480,7 @@ class ServerArgs:
             "--chunked-prefill-size",
             type=int,
             default=ServerArgs.chunked_prefill_size,
-            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill",
+            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill.",
         )
         parser.add_argument(
             "--max-prefill-tokens",
@@ -505,7 +505,7 @@ class ServerArgs:
             "--cpu-offload-gb",
             type=int,
             default=ServerArgs.cpu_offload_gb,
-            help="How many GBs of RAM to reserve for CPU offloading",
+            help="How many GBs of RAM to reserve for CPU offloading.",
         )
 
     # Other runtime options
diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh
index 408adbaf8..b1cf2eb44 100755
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -26,4 +26,4 @@ pip install transformers==4.45.2 sentence_transformers accelerate peft pandas da
 pip install cuda-python nvidia-cuda-nvrtc-cu12
 
 # reinstall sgl-kernel
-pip install sgl-kernel==0.0.4 --force-reinstall --no-deps
+pip install sgl-kernel==0.0.4.post1 --force-reinstall --no-deps