Fix fast decode plan for flashinfer v0.4.0rc1 and upgrade sgl-kernel to 0.3.11 (#10634)

Co-authored-by: zhyncs <me@zhyncs.com>
2025-09-19 01:25:29 -07:00
parent 4f2055ad56
commit 3fa3c22ae2
5 changed files with 10 additions and 7 deletions
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -57,12 +57,12 @@ dependencies = [
  "uvicorn",
  "uvloop",
  "xgrammar==0.1.24",
-  "sgl-kernel==0.3.10",
+  "sgl-kernel==0.3.11",
  "torch==2.8.0",
  "torchaudio==2.8.0",
  "torchvision",
  "cuda-python",
-  "flashinfer_python==0.3.1",
+  "flashinfer_python==0.4.0rc1",
  "openai==1.99.1",
  "tiktoken",
  "anthropic>=0.20.0",
--- a/python/pyproject_other.toml
+++ b/python/pyproject_other.toml
@@ -65,7 +65,7 @@ tracing = [

 srt = [
    "sglang[runtime_common]",
-    "sgl-kernel==0.3.10",
+    "sgl-kernel==0.3.11",
    "torch==2.8.0",
    "torchaudio==2.8.0",
    "torchvision",
@@ -75,7 +75,7 @@ srt = [

 blackwell = [
    "sglang[runtime_common]",
-    "sgl-kernel==0.3.10",
+    "sgl-kernel==0.3.11",
    "torch==2.8.0",
    "torchaudio==2.8.0",
    "torchvision",
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -703,7 +703,7 @@ def _set_envs_and_config(server_args: ServerArgs):
    if server_args.attention_backend == "flashinfer":
        assert_pkg_version(
            "flashinfer_python",
-            "0.3.1",
+            "0.4.0rc1",
            "Please uninstall the old version and "
            "reinstall the latest version by following the instructions "
            "at https://docs.flashinfer.ai/installation.html.",
@@ -711,7 +711,7 @@ def _set_envs_and_config(server_args: ServerArgs):
    if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
        assert_pkg_version(
            "sgl-kernel",
-            "0.3.10",
+            "0.3.11",
            "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
        )

--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -1432,6 +1432,9 @@ def fast_decode_plan(
                    head_dim,
                    head_dim,
                    False,  # causal
+                    window_left,
+                    -1,
+                    False,
                )
            except Exception as e:
                raise RuntimeError(f"Error in standard plan: {e}")