From 2c562fd2d056fe37dff0682e45078539b0aaf354 Mon Sep 17 00:00:00 2001 From: Hubert Lu <55214931+hubertlu-tw@users.noreply.github.com> Date: Thu, 4 Sep 2025 00:48:58 -0700 Subject: [PATCH] Fix Llama 4 with MXFP4 dynamic quant on MI35x (#9993) --- python/sglang/srt/layers/quantization/mxfp4.py | 5 ++++- python/sglang/srt/server_args.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/mxfp4.py b/python/sglang/srt/layers/quantization/mxfp4.py index c353cbba3..8180fb8b9 100644 --- a/python/sglang/srt/layers/quantization/mxfp4.py +++ b/python/sglang/srt/layers/quantization/mxfp4.py @@ -816,7 +816,10 @@ class Mxfp4DynamicQuantMoEMethod(FusedMoEMethodBase): moe_runner_config: MoeRunnerConfig, ) -> torch.Tensor: topk_weights, topk_ids, _ = topk_output - + if _is_hip: + topk_weights = topk_weights.to( + torch.float32 + ) # aiter's moe_sorting requires topk_weights to be FP32 return fused_moe( x, layer.w13_weight, diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index c6255223d..86b0f1c18 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -2336,7 +2336,8 @@ class ServerArgs: assert self.attention_backend in { "fa3", "aiter", - }, "fa3 or aiter is required for Llama4 model" + "triton", + }, "fa3, aiter, or triton is required for Llama4 model" elif model_arch in [ "Gemma2ForCausalLM", "Gemma3ForCausalLM",