fix: zero_init buffer (#9065)
Co-authored-by: Yineng Zhang <me@zhyncs.com>
@@ -63,7 +63,7 @@ srt = [
   "torchaudio==2.8.0",
   "torchvision",
   "cuda-python",
-  "flashinfer_python==0.2.11.post1",
+  "flashinfer_python==0.2.11.post3",
 ]
 
 blackwell = [
@@ -73,7 +73,7 @@ blackwell = [
   "torchaudio==2.8.0",
   "torchvision",
   "cuda-python",
-  "flashinfer_python==0.2.11.post1",
+  "flashinfer_python==0.2.11.post3",
 ]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -647,7 +647,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.11.post1",
+            "0.2.11.post3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
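For readers unfamiliar with the gate updated above: assert_pkg_version refuses to start the server when flashinfer_python is missing or older than the pinned release. A minimal sketch of such a check using importlib.metadata and the packaging library; this is a hypothetical stand-in, not sglang's actual assert_pkg_version implementation, and it assumes packaging is installed:

from importlib.metadata import PackageNotFoundError, version

from packaging.version import parse


def check_min_version(pkg: str, min_version: str, hint: str) -> None:
    # Raise if the package is absent or older than the required version.
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        raise RuntimeError(f"{pkg} is not installed. {hint}")
    if parse(installed) < parse(min_version):
        raise RuntimeError(
            f"{pkg}=={installed} is too old (need >= {min_version}). {hint}"
        )


check_min_version(
    "flashinfer_python",
    "0.2.11.post3",
    "Please reinstall per https://docs.flashinfer.ai/installation.html.",
)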
@@ -122,6 +122,7 @@ class FlashInferAttnBackend(AttentionBackend):
         # Allocate buffers
         global global_workspace_buffer
         if global_workspace_buffer is None:
+            # different from flashinfer zero_init_global_workspace_buffer
             global_workspace_buffer = torch.empty(
                 global_config.flashinfer_workspace_size,
                 dtype=torch.uint8,
@@ -81,6 +81,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         # Allocate buffers
         global global_workspace_buffer
         if global_workspace_buffer is None:
+            # different from flashinfer zero_init_global_workspace_buffer
             global_workspace_buffer = torch.empty(
                 global_config.flashinfer_workspace_size,
                 dtype=torch.uint8,
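The two FlashInfer hunks above deliberately keep torch.empty and only document that this differs from flashinfer's zero_init_global_workspace_buffer, while the TRTLLM hunks below switch to torch.zeros. The distinction, sketched with plain PyTorch (size and device are illustrative, not taken from the codebase):

import torch

SIZE = 128 * 1024 * 1024  # illustrative workspace size in bytes

# torch.empty only reserves memory; the contents are stale garbage.
# That is safe when every kernel fully writes a workspace region
# before reading it back.
fast_ws = torch.empty(SIZE, dtype=torch.uint8, device="cuda")

# torch.zeros pays for an extra fill kernel but guarantees all-zero
# contents, which matters for kernels that treat workspace bytes as
# initial state (e.g., accumulators or flags expected to start at 0).
safe_ws = torch.zeros(SIZE, dtype=torch.uint8, device="cuda")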
@@ -23,10 +23,12 @@ if TYPE_CHECKING:
     from sglang.srt.speculative.spec_info import SpecInfo
 
 # Constants
-DEFAULT_WORKSPACE_SIZE_MB = 128  # Memory workspace size in MB
+DEFAULT_WORKSPACE_SIZE_MB = (
+    512  # Memory workspace size in MB, todo(Yingyi): read from config
+)
 
 # Reuse this workspace buffer across all TRTLLM MHA wrappers
-global_workspace_buffer = None
+global_zero_init_workspace_buffer = None
 
 
 @dataclass
@@ -73,14 +75,14 @@ class TRTLLMHAAttnBackend(FlashInferAttnBackend):
         # Workspace allocation
         self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024
         # Allocate buffers
-        global global_workspace_buffer
-        if global_workspace_buffer is None:
-            global_workspace_buffer = torch.empty(
+        global global_zero_init_workspace_buffer
+        if global_zero_init_workspace_buffer is None:
+            global_zero_init_workspace_buffer = torch.zeros(
                 self.workspace_size,
                 dtype=torch.uint8,
                 device=model_runner.device,
             )
-        self.workspace_buffer = global_workspace_buffer
+        self.workspace_buffer = global_zero_init_workspace_buffer
 
         # CUDA graph state
         self.decode_cuda_graph_metadata = {}
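The hunk above turns the per-wrapper workspace into a lazily created, zero-initialized singleton shared by all TRTLLM MHA wrappers. A self-contained sketch of that pattern; the names _WORKSPACE and get_zero_init_workspace are hypothetical and not from the sglang codebase:

from typing import Optional

import torch

# Hypothetical module-level singleton, mirroring
# global_zero_init_workspace_buffer in the diff above.
_WORKSPACE: Optional[torch.Tensor] = None


def get_zero_init_workspace(size_bytes: int, device: str) -> torch.Tensor:
    # Allocate once on first use; every later caller reuses the same
    # tensor, so all wrappers share a single zeroed allocation.
    global _WORKSPACE
    if _WORKSPACE is None:
        _WORKSPACE = torch.zeros(size_bytes, dtype=torch.uint8, device=device)
    return _WORKSPACE

The rename to global_zero_init_workspace_buffer also signals at call sites that, unlike the torch.empty buffer kept by the FlashInfer backends, this buffer is guaranteed zero-filled.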
@@ -39,6 +39,8 @@ DEFAULT_WORKSPACE_SIZE_MB = 128  # Memory workspace size in MB
 # compute the LCM with other padding constraints.
 TRTLLM_BLOCK_CONSTRAINT = 128
 
+global_zero_init_workspace_buffer = None
+
 
 @dataclass
 class TRTLLMMLADecodeMetadata:
@@ -83,9 +85,14 @@ class TRTLLMMLABackend(FlashInferMLAAttnBackend):
 
         # Workspace allocation
         self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024
-        self.workspace_buffer = torch.empty(
-            self.workspace_size, dtype=torch.int8, device=self.device
-        )
+        global global_zero_init_workspace_buffer
+        if global_zero_init_workspace_buffer is None:
+            global_zero_init_workspace_buffer = torch.zeros(
+                self.workspace_size,
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        self.workspace_buffer = global_zero_init_workspace_buffer
 
         # CUDA graph state
         self.decode_cuda_graph_metadata = {}
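Note that the MLA hunk does more than rename: previously each TRTLLMMLABackend instance allocated its own torch.empty workspace with dtype=torch.int8 on self.device; it now shares the module-level zero-initialized uint8 buffer allocated on model_runner.device, matching the MHA backend above.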
@@ -143,4 +143,4 @@
     "num_warps": 4,
     "num_stages": 3
   }
 }
@@ -143,4 +143,4 @@
     "num_warps": 4,
     "num_stages": 4
   }
 }
@@ -143,4 +143,4 @@
     "num_warps": 4,
     "num_stages": 3
   }
 }
@@ -143,4 +143,4 @@
     "num_warps": 4,
     "num_stages": 3
   }
 }