[bugfix] fix w8a8dynamic fused_moe trans nz (#5199)

### What this PR does / why we need it? Currently, `torch_npu.npu_grouped_matmul_swiglu_quant` only supports weights in the NZ format (`ACL_FORMAT_FRACTAL_NZ`), so `w13_weight` and `w2_weight` must always be cast to NZ with `torch_npu.npu_format_cast` instead of going through `maybe_trans_nz`. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-12-22 17:45:34 +08:00
parent 55beac9c91
commit 052e472453
2 changed files with 52 additions and 3 deletions
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -29,7 +29,7 @@ from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.flash_common3_context import get_flash_common3_context
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.utils import maybe_trans_nz
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, maybe_trans_nz


 class AscendW8A8DynamicLinearMethod:
@@ -276,8 +276,12 @@ class AscendW8A8DynamicFusedMoEMethod:
            1, 2).contiguous()
        layer.w2_weight.data = layer.w2_weight.data.transpose(1,
                                                              2).contiguous()
-        layer.w13_weight.data = maybe_trans_nz(layer.w13_weight.data)
-        layer.w2_weight.data = maybe_trans_nz(layer.w2_weight.data)
+        # TODO(zzzzwwjj): Currently, `torch_npu.npu_grouped_matmul_swiglu_quant`
+        # can only support weight nz.
+        layer.w13_weight.data = torch_npu.npu_format_cast(
+            layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
+        layer.w2_weight.data = torch_npu.npu_format_cast(
+            layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)
        layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
            layer.w13_weight_scale.data.shape[0], -1)
        layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(