Fix Llama 4 with MXFP4 dynamic quant on MI35x (#9993)
This commit is contained in:
@@ -816,7 +816,10 @@ class Mxfp4DynamicQuantMoEMethod(FusedMoEMethodBase):
|
|||||||
moe_runner_config: MoeRunnerConfig,
|
moe_runner_config: MoeRunnerConfig,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
topk_weights, topk_ids, _ = topk_output
|
topk_weights, topk_ids, _ = topk_output
|
||||||
|
if _is_hip:
|
||||||
|
topk_weights = topk_weights.to(
|
||||||
|
torch.float32
|
||||||
|
) # aiter's moe_sorting requires topk_weights to be FP32
|
||||||
return fused_moe(
|
return fused_moe(
|
||||||
x,
|
x,
|
||||||
layer.w13_weight,
|
layer.w13_weight,
|
||||||
|
|||||||
@@ -2336,7 +2336,8 @@ class ServerArgs:
|
|||||||
assert self.attention_backend in {
|
assert self.attention_backend in {
|
||||||
"fa3",
|
"fa3",
|
||||||
"aiter",
|
"aiter",
|
||||||
}, "fa3 or aiter is required for Llama4 model"
|
"triton",
|
||||||
|
}, "fa3, aiter, or triton is required for Llama4 model"
|
||||||
elif model_arch in [
|
elif model_arch in [
|
||||||
"Gemma2ForCausalLM",
|
"Gemma2ForCausalLM",
|
||||||
"Gemma3ForCausalLM",
|
"Gemma3ForCausalLM",
|
||||||
|
|||||||
Reference in New Issue
Block a user