Support mxfp4 for GPT-OSS (#8843)

Co-authored-by: fzyzcjy <ch271828n@outlook.com>
Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Co-authored-by: zhuofan1123 <zhuofanl@nvidia.com>
Co-authored-by: liz-badada <jinyanc@nvidia.com>
Co-authored-by: xutizhou <xutingz@nvidia.com>
Co-authored-by: linhu-nv <linhu@nvidia.com>
This commit is contained in:
Ying Sheng
2025-08-06 00:05:25 -07:00
committed by GitHub
parent cbbb738371
commit 168033d5fb
9 changed files with 791 additions and 325 deletions

View File

@@ -464,6 +464,16 @@ class ServerArgs:
self.enable_triton_kernel_moe = True
self.disable_hybrid_swa_memory = True
quantization_config = getattr(
self.get_hf_config(), "quantization_config", None
)
if (
quantization_config is not None
and quantization_config.get("quant_method") == "mxfp4"
):
# use bf16 for mxfp4 triton kernels
self.dtype = "bfloat16"
# Set page size
if self.page_size is None:
self.page_size = 1