From 44cb060785b82719a26b7e78f7116cb2393a390f Mon Sep 17 00:00:00 2001
From: Yineng Zhang <me@zhyncs.com>
Date: Thu, 9 Oct 2025 14:17:54 -0700
Subject: [PATCH] chore: upgrade flashinfer to 0.4.0 (#11364)

---
 python/pyproject.toml                                        | 2 +-
 python/pyproject_other.toml                                  | 2 +-
 python/sglang/srt/entrypoints/engine.py                      | 2 +-
 python/sglang/srt/layers/attention/flashinfer_mla_backend.py | 2 +-
 scripts/ci/ci_install_dependency.sh                          | 2 ++
 5 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 78ee0041a..5d7c74a8a 100755
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -24,7 +24,7 @@ dependencies = [
   "datasets",
   "einops",
   "fastapi",
-  "flashinfer_python==0.4.0rc3",
+  "flashinfer_python==0.4.0",
   "hf_transfer",
   "huggingface_hub",
   "interegular",
diff --git a/python/pyproject_other.toml b/python/pyproject_other.toml
index 4d20b593b..9f3861a51 100755
--- a/python/pyproject_other.toml
+++ b/python/pyproject_other.toml
@@ -70,7 +70,7 @@ srt = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "flashinfer_python==0.4.0rc3",
+    "flashinfer_python==0.4.0",
 ]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index 7f5d74302..a9f88dbf8 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -703,7 +703,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.4.0rc3",
+            "0.4.0",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
diff --git a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py
index e785b6013..21e6772bf 100644
--- a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py
@@ -1060,7 +1060,7 @@ def fast_mla_decode_plan(
 
     try:
         # Standard version with just the required arguments (no use_profiler)
-        self._cached_module.plan.default(
+        self._cached_module.plan(
             self._float_workspace_buffer,
             self._int_workspace_buffer,
             self._pin_memory_int_workspace_buffer,
diff --git a/scripts/ci/ci_install_dependency.sh b/scripts/ci/ci_install_dependency.sh
index b3502e342..f6cf46dff 100755
--- a/scripts/ci/ci_install_dependency.sh
+++ b/scripts/ci/ci_install_dependency.sh
@@ -74,3 +74,5 @@ fi
 # Show current packages
 $PIP_CMD list
 python3 -c "import torch; print(torch.version.cuda)"
+
+python3 -m flashinfer clear-cache