Replace enable_flashinfer_mla argument with attention_backend (#5005)

This commit is contained in:
Baizhou Zhang
2025-04-03 02:53:58 -07:00
committed by GitHub
parent 772d2a191d
commit e8999b13b7
8 changed files with 21 additions and 20 deletions

View File

@@ -684,7 +684,6 @@ class DeepseekV2AttentionMLA(nn.Module):
self.w_vc = None
self.w_scale = None
-self.enable_flashinfer_mla = global_server_args_dict["enable_flashinfer_mla"]
self.flashinfer_mla_disable_ragged = global_server_args_dict[
"flashinfer_mla_disable_ragged"
]
@@ -692,7 +691,7 @@ class DeepseekV2AttentionMLA(nn.Module):
self.rocm_fused_decode_mla = os.getenv("SGLANG_ROCM_FUSED_DECODE_MLA") == "1"
def no_absorb(self, forward_batch: ForwardBatch) -> bool:
-if self.enable_flashinfer_mla:
+if self.attention_backend == "flashinfer_mla":
# Flashinfer MLA: Do not absorb when enabling ragged prefill
return (
not self.flashinfer_mla_disable_ragged