Fix the global scale fix not supporting EPLB and improve its enabling condition (#10369)

2025-09-14 16:07:47 +08:00
parent abea9250da
commit 2df532ef20
2 changed files with 8 additions and 10 deletions
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -999,12 +999,14 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
            data=torch.empty(layer.num_experts, 2, dtype=torch.float32),
            weight_loader=weight_loader,
        )
+        w13_input_scale._sglang_require_global_experts = True
        layer.register_parameter("w13_input_scale", w13_input_scale)

        w2_input_scale = PerTensorScaleParameter(
            data=torch.empty(layer.num_experts, dtype=torch.float32),
            weight_loader=weight_loader,
        )
+        w2_input_scale._sglang_require_global_experts = True
        layer.register_parameter("w2_input_scale", w2_input_scale)

    def swizzle_blockscale(self, scale: torch.Tensor):