fix nz for quantization (#4943)

Quantization ops require the NZ weight format unconditionally, so the is_enable_nz() check should be removed from their weight-processing path.
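
For illustration, a minimal sketch of the resulting behavior, assuming only the torch_npu.npu_format_cast_ API and the ACL_FORMAT_FRACTAL_NZ constant that appear in the diff below; the function name is hypothetical and the enum value is taken from the Ascend ACL format table:

    import torch_npu

    # Ascend ACL enum value for the FRACTAL_NZ layout (assumed, per the ACL format table).
    ACL_FORMAT_FRACTAL_NZ = 29

    def cast_quantized_moe_weights_to_nz(layer):
        # The W8A8 dynamic MoE kernels consume weights in NZ layout only,
        # so the cast is applied unconditionally, with no is_enable_nz() gate.
        torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
        torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)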

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Date: 2025-12-12 14:54:41 +08:00
Committed by: GitHub
Parent: 5932abc446
Commit: 01a13a9b77
4 changed files with 11 additions and 14 deletions


@@ -270,9 +270,8 @@ class AscendW8A8DynamicFusedMoEMethod:
             1, 2).contiguous()
         layer.w2_weight.data = layer.w2_weight.data.transpose(
             1, 2).contiguous()
-        if is_enable_nz():
-            torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
-            torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
+        torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
+        torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
         layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
             layer.w13_weight_scale.data.shape[0], -1)
         layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(
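
Design note: is_enable_nz() remains the right gate where NZ is an optional performance format, but on the W8A8 dynamic-quantization path the kernels rely on NZ unconditionally, which is why the casts above no longer depend on that flag.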