fix nz for quantization (#4943)

quantization ops rely on NZ by force, we should remove the nz check for it. Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-12 14:54:41 +08:00
parent 5932abc446
commit 01a13a9b77
4 changed files with 11 additions and 14 deletions
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -347,7 +347,7 @@ class AscendW8A8FusedMoEMethod:
        # converting ACL_FORMAT_FRACTAL_NZ.
        # npu_quant_grouped_matmul_dequant in eager mode does not accept
        # ACL_FORMAT_FRACTAL_NZ.
-        if not is_310p() and is_enable_nz():
+        if not is_310p():
            layer.w13_weight.data = torch_npu.npu_format_cast(
                layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
            layer.w2_weight.data = torch_npu.npu_format_cast(