[Feat] Unquantized Linear to nz and control all nz-cast (#3356)

### What this PR does / why we need it? Currently, when executing to the Linear layer of models in vLLM-Ascend, the weights format is ND in unquantized case and skipped ascend case. This PR supplements the execution logic for Linear layer. We use a new global variable: VLLM_ASCEND_ENABLE_NZ. When VLLM_ASCEND_ENABLE_NZ=1 and CANN version is 8.3, the weights of the Linear layer will be converted to FRACTAL_NZ, in both unquantized case and skipped ascend case. We also use VLLM_ASCEND_ENABLE_NZ to control the existing NZ conversion, such as w8a8-quantized case. ### Does this PR introduce _any_ user-facing change? Add a new global variable VLLM_ASCEND_ENABLE_NZ. If you want to use NZ format, you should set VLLM_ASCEND_ENABLE_NZ=1. ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
2025-10-14 17:39:26 +08:00
parent 5c45c227dc
commit 07e39620ea
22 changed files with 413 additions and 49 deletions
--- a/tests/ut/torchair/test_utils.py
+++ b/tests/ut/torchair/test_utils.py
@@ -96,15 +96,17 @@ class TestTorchairUtils(TestBase):
            self.assertEqual(args[0], expected_name)
            self.assertEqual(args[1], expected_path)

+    @mock.patch('vllm_ascend.torchair.utils.is_enable_nz')
    @mock.patch('torch_npu.get_npu_format')
    @mock.patch('torch_npu.npu_format_cast')
    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
                new=mock.MagicMock)
-    def test_converting_weight_acl_format(self, mock_npu_cast,
-                                          mock_get_format):
+    def test_converting_weight_acl_format_to_nz(self, mock_npu_cast,
+                                                mock_get_format, mock_is_nz):
        ACL_FORMAT_FRACTAL_NZ = 29
        mock_get_format.return_value = 1
        mock_npu_cast.return_value = 1
+        mock_is_nz.return_value = 1

        fused_moe = mock.MagicMock()
        fused_moe.w13_weight = mock.MagicMock()
@@ -137,3 +139,26 @@ class TestTorchairUtils(TestBase):

        utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
        mock_npu_cast.assert_not_called()
+
+    @mock.patch('vllm_ascend.torchair.utils.is_enable_nz')
+    @mock.patch('torch_npu.get_npu_format')
+    @mock.patch('torch_npu.npu_format_cast')
+    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
+                new=mock.MagicMock)
+    def test_converting_weight_acl_format_no_nz(self, mock_npu_cast,
+                                                mock_get_format, mock_is_nz):
+        ACL_FORMAT_FRACTAL_NZ = 29
+        mock_get_format.return_value = 1
+        mock_npu_cast.return_value = 1
+        mock_is_nz.return_value = 0
+
+        fused_moe = mock.MagicMock()
+        fused_moe.w13_weight = mock.MagicMock()
+        fused_moe.w2_weight = mock.MagicMock()
+        fused_moe.w13_weight.data = torch.randn(128, 256)
+        fused_moe.w2_weight.data = torch.randn(256, 128)
+        model = mock.MagicMock()
+        model.modules.return_value = [fused_moe]
+
+        utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
+        mock_npu_cast.assert_not_called()