diff --git a/python/pyproject.toml b/python/pyproject.toml index 849d9c006..d5a9cafdb 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -63,7 +63,7 @@ srt = [ "torchaudio==2.8.0", "torchvision", "cuda-python", - "flashinfer_python==0.2.11.post1", + "flashinfer_python==0.2.11.post3", ] blackwell = [ @@ -73,7 +73,7 @@ blackwell = [ "torchaudio==2.8.0", "torchvision", "cuda-python", - "flashinfer_python==0.2.11.post1", + "flashinfer_python==0.2.11.post3", ] # HIP (Heterogeneous-computing Interface for Portability) for AMD diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index 79283f8bd..d2f99e739 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -647,7 +647,7 @@ def _set_envs_and_config(server_args: ServerArgs): if server_args.attention_backend == "flashinfer": assert_pkg_version( "flashinfer_python", - "0.2.11.post1", + "0.2.11.post3", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index f50676c3b..00d09e69d 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -122,6 +122,7 @@ class FlashInferAttnBackend(AttentionBackend): # Allocate buffers global global_workspace_buffer if global_workspace_buffer is None: + # different from flashinfer zero_init_global_workspace_buffer global_workspace_buffer = torch.empty( global_config.flashinfer_workspace_size, dtype=torch.uint8, diff --git a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py index 1b8dc64e5..90576a17a 100644 --- a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py @@ -81,6 +81,7 @@ class FlashInferMLAAttnBackend(AttentionBackend): # Allocate buffers global global_workspace_buffer if global_workspace_buffer is None: + # different from flashinfer zero_init_global_workspace_buffer global_workspace_buffer = torch.empty( global_config.flashinfer_workspace_size, dtype=torch.uint8, diff --git a/python/sglang/srt/layers/attention/trtllm_mha_backend.py b/python/sglang/srt/layers/attention/trtllm_mha_backend.py index 59bc12219..d8cb8aa0b 100644 --- a/python/sglang/srt/layers/attention/trtllm_mha_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mha_backend.py @@ -23,10 +23,12 @@ if TYPE_CHECKING: from sglang.srt.speculative.spec_info import SpecInfo # Constants -DEFAULT_WORKSPACE_SIZE_MB = 128 # Memory workspace size in MB +DEFAULT_WORKSPACE_SIZE_MB = ( + 512 # Memory workspace size in MB, todo(Yingyi): read from config +) # Reuse this workspace buffer across all TRTLLM MHA wrappers -global_workspace_buffer = None +global_zero_init_workspace_buffer = None @dataclass @@ -73,14 +75,14 @@ class TRTLLMHAAttnBackend(FlashInferAttnBackend): # Workspace allocation self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024 # Allocate buffers - global global_workspace_buffer - if global_workspace_buffer is None: - global_workspace_buffer = torch.empty( + global global_zero_init_workspace_buffer + if global_zero_init_workspace_buffer is None: + global_zero_init_workspace_buffer = torch.zeros( self.workspace_size, dtype=torch.uint8, device=model_runner.device, ) - self.workspace_buffer = global_workspace_buffer + self.workspace_buffer = global_zero_init_workspace_buffer # CUDA graph state self.decode_cuda_graph_metadata = {} diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index d4ea74bf4..7aeb00d6b 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -39,6 +39,8 @@ DEFAULT_WORKSPACE_SIZE_MB = 128 # Memory workspace size in MB # compute the LCM with other padding constraints. TRTLLM_BLOCK_CONSTRAINT = 128 +global_zero_init_workspace_buffer = None + @dataclass class TRTLLMMLADecodeMetadata: @@ -83,9 +85,14 @@ class TRTLLMMLABackend(FlashInferMLAAttnBackend): # Workspace allocation self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024 - self.workspace_buffer = torch.empty( - self.workspace_size, dtype=torch.int8, device=self.device - ) + global global_zero_init_workspace_buffer + if global_zero_init_workspace_buffer is None: + global_zero_init_workspace_buffer = torch.zeros( + self.workspace_size, + dtype=torch.uint8, + device=model_runner.device, + ) + self.workspace_buffer = global_zero_init_workspace_buffer # CUDA graph state self.decode_cuda_graph_metadata = {} diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json index 198040638..6d6af0808 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -143,4 +143,4 @@ "num_warps": 4, "num_stages": 3 } -} \ No newline at end of file +} diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json index c4ef2475d..1f3648ca8 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json @@ -143,4 +143,4 @@ "num_warps": 4, "num_stages": 4 } -} \ No newline at end of file +} diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json index efb4a52bf..8e9668889 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -143,4 +143,4 @@ "num_warps": 4, "num_stages": 3 } -} \ No newline at end of file +} diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json index 1fc57ef07..68f6fb5ac 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -143,4 +143,4 @@ "num_warps": 4, "num_stages": 3 } -} \ No newline at end of file +}