From 3fa3c22ae2c470d6ffb7f1acba805288bb7992ef Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Sep 2025 01:25:29 -0700 Subject: [PATCH] Fix fast decode plan for flashinfer v0.4.0rc1 and upgrade sgl-kernel 0.3.11 (#10634) Co-authored-by: zhyncs --- docker/Dockerfile | 2 +- python/pyproject.toml | 4 ++-- python/pyproject_other.toml | 4 ++-- python/sglang/srt/entrypoints/engine.py | 4 ++-- python/sglang/srt/layers/attention/flashinfer_backend.py | 3 +++ 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7bb535531..40b9edbb8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -85,7 +85,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li && python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \ && python3 -m flashinfer --download-cubin \ && if [ "$CUDA_VERSION" = "12.6.1" ]; then \ - python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.10/sgl_kernel-0.3.10+cu124-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \ + python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.11/sgl_kernel-0.3.11+cu124-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \ fi # Download source files diff --git a/python/pyproject.toml b/python/pyproject.toml index eaba878f9..d98a79218 100755 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -57,12 +57,12 @@ dependencies = [ "uvicorn", "uvloop", "xgrammar==0.1.24", - "sgl-kernel==0.3.10", + "sgl-kernel==0.3.11", "torch==2.8.0", "torchaudio==2.8.0", "torchvision", "cuda-python", - "flashinfer_python==0.3.1", + "flashinfer_python==0.4.0rc1", "openai==1.99.1", "tiktoken", "anthropic>=0.20.0", diff --git a/python/pyproject_other.toml b/python/pyproject_other.toml index 59c8db0e5..68960d0eb 100755 --- a/python/pyproject_other.toml +++ b/python/pyproject_other.toml @@ -65,7 +65,7 @@ tracing = [ srt = [ "sglang[runtime_common]", - "sgl-kernel==0.3.10", + "sgl-kernel==0.3.11", "torch==2.8.0", "torchaudio==2.8.0", "torchvision", @@ -75,7 +75,7 @@ srt = [ blackwell = [ "sglang[runtime_common]", - "sgl-kernel==0.3.10", + "sgl-kernel==0.3.11", "torch==2.8.0", "torchaudio==2.8.0", "torchvision", diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index 012ff5ab7..66cf2c873 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -703,7 +703,7 @@ def _set_envs_and_config(server_args: ServerArgs): if server_args.attention_backend == "flashinfer": assert_pkg_version( "flashinfer_python", - "0.3.1", + "0.4.0rc1", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", @@ -711,7 +711,7 @@ def _set_envs_and_config(server_args: ServerArgs): if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"): assert_pkg_version( "sgl-kernel", - "0.3.10", + "0.3.11", "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`", ) diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index 107ac0cbd..b761c8423 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -1432,6 +1432,9 @@ def fast_decode_plan( head_dim, head_dim, False, # causal + window_left, + -1, + False, ) except Exception as e: raise RuntimeError(f"Error in standard plan: {e}")