Support mxfp4 for GPT-OSS (#8843)
Co-authored-by: fzyzcjy <ch271828n@outlook.com>
Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Co-authored-by: zhuofan1123 <zhuofanl@nvidia.com>
Co-authored-by: liz-badada <jinyanc@nvidia.com>
Co-authored-by: xutizhou <xutingz@nvidia.com>
Co-authored-by: linhu-nv <linhu@nvidia.com>
This commit is contained in:
@@ -464,6 +464,16 @@ class ServerArgs:
|
||||
# NOTE(review): fragment of a ServerArgs configuration method — the enclosing
# `def` is outside this view, so the conditions guarding these first two
# assignments are not visible here. Presumably this branch runs for GPT-OSS /
# mxfp4 models (per the commit title) — TODO confirm against the full method.
self.enable_triton_kernel_moe = True
self.disable_hybrid_swa_memory = True

# Read the quantization section of the HuggingFace model config.
# getattr with a None default: not every checkpoint carries a
# `quantization_config` attribute, so absence must not raise.
quantization_config = getattr(
    self.get_hf_config(), "quantization_config", None
)
# `.get("quant_method")` implies quantization_config is a dict-like mapping;
# only the "mxfp4" method triggers the dtype override below.
if (
    quantization_config is not None
    and quantization_config.get("quant_method") == "mxfp4"
):
    # use bf16 for mxfp4 triton kernels
    self.dtype = "bfloat16"

# Set page size
# Only fill in the default when the user left page_size unset; an explicit
# value passed on the command line is preserved.
if self.page_size is None:
    self.page_size = 1
||||
Reference in New Issue
Block a user