Better optimization log for gpt-oss model (#8953)
This commit is contained in:
@@ -488,8 +488,14 @@ class ServerArgs:
|
||||
if is_sm100_supported() and is_mxfp4_quant_format:
|
||||
self.enable_flashinfer_mxfp4_moe = True
|
||||
self.enable_triton_kernel_moe = False
|
||||
logger.info(
|
||||
"Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
|
||||
)
|
||||
else:
|
||||
self.enable_triton_kernel_moe = True
|
||||
logger.info(
|
||||
"Detected GPT-OSS model, enabling triton_kernels MOE kernel."
|
||||
)
|
||||
|
||||
self.disable_hybrid_swa_memory = True
|
||||
|
||||
|
||||
Reference in New Issue
Block a user