Fix fast decode plan for flashinfer v0.4.0rc1 and upgrade sgl-kernel 0.3.11 (#10634)

Co-authored-by: zhyncs <me@zhyncs.com>
This commit is contained in:
Baizhou Zhang
2025-09-19 01:25:29 -07:00
committed by GitHub
parent 4f2055ad56
commit 3fa3c22ae2
5 changed files with 10 additions and 7 deletions

View File

@@ -57,12 +57,12 @@ dependencies = [
 "uvicorn",
 "uvloop",
 "xgrammar==0.1.24",
-"sgl-kernel==0.3.10",
+"sgl-kernel==0.3.11",
 "torch==2.8.0",
 "torchaudio==2.8.0",
 "torchvision",
 "cuda-python",
-"flashinfer_python==0.3.1",
+"flashinfer_python==0.4.0rc1",
 "openai==1.99.1",
 "tiktoken",
 "anthropic>=0.20.0",

View File

@@ -65,7 +65,7 @@ tracing = [
 srt = [
 "sglang[runtime_common]",
-"sgl-kernel==0.3.10",
+"sgl-kernel==0.3.11",
 "torch==2.8.0",
 "torchaudio==2.8.0",
 "torchvision",
@@ -75,7 +75,7 @@ srt = [
 blackwell = [
 "sglang[runtime_common]",
-"sgl-kernel==0.3.10",
+"sgl-kernel==0.3.11",
 "torch==2.8.0",
 "torchaudio==2.8.0",
 "torchvision",

View File

@@ -703,7 +703,7 @@ def _set_envs_and_config(server_args: ServerArgs):
 if server_args.attention_backend == "flashinfer":
 assert_pkg_version(
 "flashinfer_python",
-"0.3.1",
+"0.4.0rc1",
 "Please uninstall the old version and "
 "reinstall the latest version by following the instructions "
 "at https://docs.flashinfer.ai/installation.html.",
@@ -711,7 +711,7 @@ def _set_envs_and_config(server_args: ServerArgs):
 if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
 assert_pkg_version(
 "sgl-kernel",
-"0.3.10",
+"0.3.11",
 "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
 )

View File

@@ -1432,6 +1432,9 @@ def fast_decode_plan(
 head_dim,
 head_dim,
 False, # causal
+window_left,
+-1,
+False,
 )
 except Exception as e:
 raise RuntimeError(f"Error in standard plan: {e}")