From 44cb060785b82719a26b7e78f7116cb2393a390f Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Thu, 9 Oct 2025 14:17:54 -0700 Subject: [PATCH] chore: upgrade flashinfer 0.4.0 (#11364) --- python/pyproject.toml | 2 +- python/pyproject_other.toml | 2 +- python/sglang/srt/entrypoints/engine.py | 2 +- python/sglang/srt/layers/attention/flashinfer_mla_backend.py | 2 +- scripts/ci/ci_install_dependency.sh | 2 ++ 5 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 78ee0041a..5d7c74a8a 100755 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "datasets", "einops", "fastapi", - "flashinfer_python==0.4.0rc3", + "flashinfer_python==0.4.0", "hf_transfer", "huggingface_hub", "interegular", diff --git a/python/pyproject_other.toml b/python/pyproject_other.toml index 4d20b593b..9f3861a51 100755 --- a/python/pyproject_other.toml +++ b/python/pyproject_other.toml @@ -70,7 +70,7 @@ srt = [ "torchaudio==2.8.0", "torchvision", "cuda-python", - "flashinfer_python==0.4.0rc3", + "flashinfer_python==0.4.0", ] # HIP (Heterogeneous-computing Interface for Portability) for AMD diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index 7f5d74302..a9f88dbf8 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -703,7 +703,7 @@ def _set_envs_and_config(server_args: ServerArgs): if server_args.attention_backend == "flashinfer": assert_pkg_version( "flashinfer_python", - "0.4.0rc3", + "0.4.0", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", diff --git a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py index e785b6013..21e6772bf 100644 --- a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py @@ -1060,7 +1060,7 @@ def fast_mla_decode_plan( try: # Standard version with just the required arguments (no use_profiler) - self._cached_module.plan.default( + self._cached_module.plan( self._float_workspace_buffer, self._int_workspace_buffer, self._pin_memory_int_workspace_buffer, diff --git a/scripts/ci/ci_install_dependency.sh b/scripts/ci/ci_install_dependency.sh index b3502e342..f6cf46dff 100755 --- a/scripts/ci/ci_install_dependency.sh +++ b/scripts/ci/ci_install_dependency.sh @@ -74,3 +74,5 @@ fi # Show current packages $PIP_CMD list python3 -c "import torch; print(torch.version.cuda)" + +python3 -m flashinfer clear-cache