[Feature] Support Flashinfer fmha on Blackwell (#6930)

This commit is contained in:
Jianan Ji
2025-06-06 15:57:50 -04:00
committed by GitHub
parent b819381fec
commit 5f91c82526
5 changed files with 18 additions and 9 deletions

View File

@@ -25,6 +25,7 @@ from sglang.global_config import global_config
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.utils import is_sm100_supported
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
from sglang.srt.utils import is_flashinfer_available, next_power_of_2
@@ -149,8 +150,11 @@ class FlashInferAttnBackend(AttentionBackend):
for _ in range(self.num_wrappers)
]
fmha_backend = "auto"
+        if is_sm100_supported():
+            fmha_backend = "cutlass"
self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
-            self.workspace_buffer, "NHD"
+            self.workspace_buffer, "NHD", backend=fmha_backend
)
# Two wrappers: one for sliding window attention and one for full attention.