Fix Llama 4 with MXFP4 dynamic quant on MI35x (#9993)

2025-09-04 00:48:58 -07:00
parent b648d86216
commit 2c562fd2d0
2 changed files with 6 additions and 2 deletions
--- a/python/sglang/srt/layers/quantization/mxfp4.py
+++ b/python/sglang/srt/layers/quantization/mxfp4.py
@@ -816,7 +816,10 @@ class Mxfp4DynamicQuantMoEMethod(FusedMoEMethodBase):
        moe_runner_config: MoeRunnerConfig,
    ) -> torch.Tensor:
        topk_weights, topk_ids, _ = topk_output
-
+        if _is_hip:
+            topk_weights = topk_weights.to(
+                torch.float32
+            )  # aiter's moe_sorting requires topk_weights to be FP32
        return fused_moe(
            x,
            layer.w13_weight,