From 3fa3c22ae2c470d6ffb7f1acba805288bb7992ef Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <sobereddiezhang@gmail.com>
Date: Fri, 19 Sep 2025 01:25:29 -0700
Subject: [PATCH] Fix fast decode plan for flashinfer v0.4.0rc1 and upgrade
 sgl-kernel to 0.3.11 (#10634)

Co-authored-by: zhyncs <me@zhyncs.com>
---
 docker/Dockerfile                                        | 2 +-
 python/pyproject.toml                                    | 4 ++--
 python/pyproject_other.toml                              | 4 ++--
 python/sglang/srt/entrypoints/engine.py                  | 4 ++--
 python/sglang/srt/layers/attention/flashinfer_backend.py | 3 +++
 5 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 7bb535531..40b9edbb8 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -85,7 +85,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
  && python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
  && python3 -m flashinfer --download-cubin \
  && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
-      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.10/sgl_kernel-0.3.10+cu124-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
+      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.11/sgl_kernel-0.3.11+cu124-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
     fi
 
 # Download source files
diff --git a/python/pyproject.toml b/python/pyproject.toml
index eaba878f9..d98a79218 100755
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -57,12 +57,12 @@ dependencies = [
   "uvicorn",
   "uvloop",
   "xgrammar==0.1.24",
-  "sgl-kernel==0.3.10",
+  "sgl-kernel==0.3.11",
   "torch==2.8.0",
   "torchaudio==2.8.0",
   "torchvision",
   "cuda-python",
-  "flashinfer_python==0.3.1",
+  "flashinfer_python==0.4.0rc1",
   "openai==1.99.1",
   "tiktoken",
   "anthropic>=0.20.0",
diff --git a/python/pyproject_other.toml b/python/pyproject_other.toml
index 59c8db0e5..68960d0eb 100755
--- a/python/pyproject_other.toml
+++ b/python/pyproject_other.toml
@@ -65,7 +65,7 @@ tracing = [
 
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.3.10",
+    "sgl-kernel==0.3.11",
     "torch==2.8.0",
     "torchaudio==2.8.0",
     "torchvision",
@@ -75,7 +75,7 @@ srt = [
 
 blackwell = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.3.10",
+    "sgl-kernel==0.3.11",
     "torch==2.8.0",
     "torchaudio==2.8.0",
     "torchvision",
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index 012ff5ab7..66cf2c873 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -703,7 +703,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.3.1",
+            "0.4.0rc1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -711,7 +711,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.10",
+            "0.3.11",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
index 107ac0cbd..b761c8423 100644
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -1432,6 +1432,9 @@ def fast_decode_plan(
                     head_dim,
                     head_dim,
                     False,  # causal
+                    window_left,
+                    -1,
+                    False,
                 )
             except Exception as e:
                 raise RuntimeError(f"Error in standard plan: {e}")