[Perf] Auto enable best flashinfer mxfp4 kernel in b200 (#8898)
@@ -248,6 +248,7 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
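The new toggle follows the existing ServerArgs convention: a plain dataclass field defaulting to off, flipped only by startup logic or an explicit CLI flag. A minimal sketch of that pattern, using a hypothetical stand-in class rather than the actual ServerArgs definition:

from dataclasses import dataclass

@dataclass
class MoeFlags:  # hypothetical stand-in for the two ServerArgs fields above
    enable_triton_kernel_moe: bool = False
    enable_flashinfer_mxfp4_moe: bool = False

    def __post_init__(self):
        # Startup logic (see the next hunk) may override these defaults
        # based on the detected GPU.
        pass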
@@ -476,18 +477,10 @@ class ServerArgs:
             or self.attention_backend == "triton"
         )
 
-        # Check if FlashInfer MXFP4 MoE is enabled
-        from sglang.srt.utils import get_bool_env_var
-
-        USE_FLASHINFER_MXFP4_MOE = get_bool_env_var(
-            "SGLANG_USE_FLASHINFER_MXFP4_MOE", "false"
-        )
-        USE_FLASHINFER_MXFP4_BF16_MOE = get_bool_env_var(
-            "SGLANG_USE_FLASHINFER_MXFP4_BF16_MOE", "false"
-        )
-
-        # Only enable Triton kernel MoE if FlashInfer is not enabled
-        if not (USE_FLASHINFER_MXFP4_MOE or USE_FLASHINFER_MXFP4_BF16_MOE):
+        if is_sm100_supported():
+            self.enable_flashinfer_mxfp4_moe = True
+            self.enable_triton_kernel_moe = False
+        else:
             self.enable_triton_kernel_moe = True
 
         self.disable_hybrid_swa_memory = True
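For context, a hedged sketch of the two helpers this hunk pivots on: the removed branch gated the backend behind SGLANG_USE_FLASHINFER_MXFP4_MOE-style environment variables via get_bool_env_var, while the new branch auto-detects SM100-class (Blackwell/B200) hardware with is_sm100_supported(). The bodies below are plausible minimal implementations, not the actual sglang.srt.utils source:

import os

import torch


def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Sketch of the removed env-var gate: treat "true"/"1" (case-insensitive)
    # as enabled, anything else as disabled.
    return os.getenv(name, default).strip().lower() in ("true", "1")


def is_sm100_supported(device=None) -> bool:
    # Sketch of the hardware check: Blackwell GPUs such as the B200 report
    # CUDA compute capability 10.x, i.e. SM100.
    major, _minor = torch.cuda.get_device_capability(device)
    return major == 10

Net effect: on SM100 the FlashInfer MXFP4 MoE path is on by default and the Triton kernel MoE is explicitly turned off; other hardware keeps the previous Triton fallback, and no environment variables are consulted.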
@@ -1846,6 +1839,11 @@ class ServerArgs:
             action="store_true",
             help="Use triton moe grouped gemm kernel.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
 
         # Debug tensor dumps
         parser.add_argument(
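The new flag is a standard argparse store_true switch: it stays False unless passed explicitly, and the auto-enable logic above can still turn the feature on without it. A small self-contained demonstration of that behavior (the parser here is illustrative, not SGLang's full CLI):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable-flashinfer-mxfp4-moe",
    action="store_true",
    help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
)

# store_true flags default to False and flip to True only when present.
assert parser.parse_args([]).enable_flashinfer_mxfp4_moe is False
assert parser.parse_args(
    ["--enable-flashinfer-mxfp4-moe"]
).enable_flashinfer_mxfp4_moe is True

On the command line this corresponds to passing --enable-flashinfer-mxfp4-moe to the server launcher (python -m sglang.launch_server); after this change it is optional on B200.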