[Perf] Auto-enable the best FlashInfer MXFP4 kernel on B200 (#8898)

Author: Xiaoyu Zhang
Date: 2025-08-07 16:08:41 +08:00
Committed by: GitHub
Parent: c36a6693f3
Commit: 47824c1488
5 changed files with 48 additions and 48 deletions


@@ -248,6 +248,7 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
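The new enable_flashinfer_mxfp4_moe field defaults to off and sits alongside the existing enable_triton_kernel_moe toggle. A minimal sketch of how a downstream consumer might dispatch on the pair; select_moe_backend is illustrative, not a function from this diff:

from dataclasses import dataclass

@dataclass
class ServerArgs:  # trimmed to the two fields this hunk touches
    enable_triton_kernel_moe: bool = False
    enable_flashinfer_mxfp4_moe: bool = False  # added by this commit

def select_moe_backend(args: ServerArgs) -> str:
    # Illustrative dispatch; the real selection happens inside sglang's MoE layers.
    if args.enable_flashinfer_mxfp4_moe:
        return "flashinfer_mxfp4"
    if args.enable_triton_kernel_moe:
        return "triton"
    return "default"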
@@ -476,18 +477,10 @@ class ServerArgs:
                 or self.attention_backend == "triton"
             )
-            # Check if FlashInfer MXFP4 MoE is enabled
-            from sglang.srt.utils import get_bool_env_var
-
-            USE_FLASHINFER_MXFP4_MOE = get_bool_env_var(
-                "SGLANG_USE_FLASHINFER_MXFP4_MOE", "false"
-            )
-            USE_FLASHINFER_MXFP4_BF16_MOE = get_bool_env_var(
-                "SGLANG_USE_FLASHINFER_MXFP4_BF16_MOE", "false"
-            )
-            # Only enable Triton kernel MoE if FlashInfer is not enabled
-            if not (USE_FLASHINFER_MXFP4_MOE or USE_FLASHINFER_MXFP4_BF16_MOE):
-                self.enable_triton_kernel_moe = True
+            if is_sm100_supported():
+                self.enable_flashinfer_mxfp4_moe = True
+                self.enable_triton_kernel_moe = False
+            else:
+                self.enable_triton_kernel_moe = True
             self.disable_hybrid_swa_memory = True
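The replacement logic drops the SGLANG_USE_FLASHINFER_MXFP4_MOE environment variables and instead keys off is_sm100_supported(), picking the FlashInfer MXFP4 backend on Blackwell and falling back to the Triton kernel elsewhere. A minimal sketch of that check, assuming it tests the CUDA compute capability (B200 reports SM 10.x); the real helper in sglang.srt.utils may additionally gate on the CUDA toolkit version:

import torch

def is_sm100_supported(device=None) -> bool:
    # Sketch: Blackwell (SM100 / B200) GPUs report compute capability 10.x.
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability(device)
    return major == 10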
@@ -1846,6 +1839,11 @@ class ServerArgs:
             action="store_true",
             help="Use triton moe grouped gemm kernel.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
         # Debug tensor dumps
         parser.add_argument(
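Even with the auto-detection above, the new argparse entry lets users force the backend explicitly. A hypothetical launch line (the model path is illustrative, not taken from this diff):

python -m sglang.launch_server --model-path openai/gpt-oss-120b --enable-flashinfer-mxfp4-moe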