[Feature] Support Flashinfer fmha on Blackwell (#6930)
This commit is contained in:
@@ -25,6 +25,7 @@ from sglang.global_config import global_config
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
 from sglang.srt.utils import is_flashinfer_available, next_power_of_2
@@ -149,8 +150,11 @@ class FlashInferAttnBackend(AttentionBackend):
             for _ in range(self.num_wrappers)
         ]

+        fmha_backend = "auto"
+        if is_sm100_supported():
+            fmha_backend = "cutlass"
         self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
-            self.workspace_buffer, "NHD"
+            self.workspace_buffer, "NHD", backend=fmha_backend
        )

        # Two wrappers: one for sliding window attention and one for full attention.
|
||||
|
||||
Reference in New Issue
Block a user