From de7649492ddcbdb7c818665f0b81cc8fbaaaa4b7 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Mon, 25 Aug 2025 19:48:55 +0800
Subject: [PATCH] [Refactor] cleanup converting_weight_acl_format (#2482)

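Move maybe_converting_weight_acl_format to the torchair module and rename
it to converting_weight_acl_format; it is only used with 310P + torchair.
The is_310p() check now lives at the call sites in NPUTorchairModelRunner.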

- vLLM version: v0.10.1.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/49ab23b3ccc2da9274c739d55f9b19206078c7a9

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 tests/ut/test_utils.py                        | 73 -------------------
 tests/ut/torchair/test_utils.py               | 45 ++++++++++++
 vllm_ascend/torchair/torchair_model_runner.py | 19 ++---
 vllm_ascend/torchair/utils.py                 | 20 +++++
 vllm_ascend/utils.py                          | 23 ------
 vllm_ascend/worker/model_runner_v1.py         |  3 -
 6 files changed, 75 insertions(+), 108 deletions(-)

diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py
index 46a3ca8..73eca32 100644
--- a/tests/ut/test_utils.py
+++ b/tests/ut/test_utils.py
@@ -117,79 +117,6 @@ class TestUtils(TestBase):
         output_tensor = utils.aligned_16(input_tensor)
         self.assertEqual(output_tensor.shape[0], 32)
 
-    @mock.patch('torch_npu.get_npu_format')
-    @mock.patch('torch_npu.npu_format_cast')
-    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
-                new=mock.MagicMock)
-    @mock.patch('vllm_ascend.utils.is_310p')
-    @mock.patch('vllm_ascend.utils.get_ascend_config')
-    def test_maybe_converting_weight_acl_format(self, mock_get_config,
-                                                mock_310p, mock_npu_cast,
-                                                mock_get_format):
-        ACL_FORMAT_FRACTAL_NZ = 29
-        mock_310p.return_value = True
-
-        mock_config = mock.MagicMock()
-        mock_config.torchair_graph_config.enabled = True
-        mock_get_config.return_value = mock_config
-        mock_get_format.return_value = 1
-
-        mock_npu_cast.return_value = 1
-
-        fused_moe = mock.MagicMock()
-        fused_moe.w13_weight = mock.MagicMock()
-        fused_moe.w2_weight = mock.MagicMock()
-        fused_moe.w13_weight.data = torch.randn(128, 256)
-        fused_moe.w2_weight.data = torch.randn(256, 128)
-        model = mock.MagicMock()
-        model.modules.return_value = [fused_moe]
-
-        utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
-        self.assertEqual(fused_moe.w13_weight.data, 1)
-
-    @mock.patch('torch_npu.get_npu_format')
-    @mock.patch('torch_npu.npu_format_cast')
-    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
-                new=mock.MagicMock)
-    @mock.patch('vllm_ascend.utils.is_310p')
-    @mock.patch('vllm_ascend.utils.get_ascend_config')
-    def test_maybe_converting_weight_acl_format_format_true(
-            self, mock_get_config, mock_310p, mock_npu_cast, mock_get_format):
-        ACL_FORMAT_FRACTAL_NZ = 29
-        mock_310p.return_value = True
-
-        mock_config = mock.MagicMock()
-        mock_config.torchair_graph_config.enabled = True
-        mock_get_config.return_value = mock_config
-        mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ
-
-        mock_npu_cast.return_value = 1
-
-        fused_moe = mock.MagicMock()
-        fused_moe.w13_weight = mock.MagicMock()
-        fused_moe.w2_weight = mock.MagicMock()
-        fused_moe.w13_weight.data = torch.randn(128, 256)
-        fused_moe.w2_weight.data = torch.randn(256, 128)
-        model = mock.MagicMock()
-        model.modules.return_value = [fused_moe]
-
-        mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ
-
-        utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
-
-    @mock.patch('vllm_ascend.utils.get_ascend_config')
-    @mock.patch('vllm_ascend.utils.is_310p', return_value=False)
-    def test_maybe_converting_weight_acl_format_not_310_not_graph(
-            self, mock_310p, mock_get_config):
-        mock_config = mock.MagicMock()
-        mock_config.torchair_graph_config.enabled = False
-        mock_get_config.return_value = mock_config
-
-        mock_constant = mock.MagicMock()
-
-        mock_model = mock.MagicMock()
-        utils.maybe_converting_weight_acl_format(mock_model, mock_constant)
-
     @mock.patch('importlib.util.find_spec')
     @mock.patch('importlib.import_module')
     def test_try_register_lib(self, mock_import_module, mock_find_spec):
diff --git a/tests/ut/torchair/test_utils.py b/tests/ut/torchair/test_utils.py
index 1d65fd1..45416bd 100644
--- a/tests/ut/torchair/test_utils.py
+++ b/tests/ut/torchair/test_utils.py
@@ -1,7 +1,10 @@
 import os
 from concurrent.futures import ThreadPoolExecutor
+from unittest import mock
 from unittest.mock import MagicMock, patch
 
+import torch
+
 from tests.ut.base import TestBase
 from vllm_ascend.torchair import utils
 
@@ -75,3 +78,45 @@ class TestTorchairUtils(TestBase):
             args, kwargs = call_args_list[i]
             self.assertEqual(args[0], expected_name)
             self.assertEqual(args[1], expected_path)
+
+    @mock.patch('torch_npu.get_npu_format')
+    @mock.patch('torch_npu.npu_format_cast')
+    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
+                new=mock.MagicMock)
+    def test_converting_weight_acl_format(self, mock_npu_cast,
+                                          mock_get_format):
+        # Weights report format 1, so both must be cast to FRACTAL_NZ (29).
+        ACL_FORMAT_FRACTAL_NZ = 29
+        mock_get_format.return_value = 1
+        mock_npu_cast.return_value = 1
+
+        fused_moe = mock.MagicMock()
+        fused_moe.w13_weight.data = torch.randn(128, 256)
+        fused_moe.w2_weight.data = torch.randn(256, 128)
+        model = mock.MagicMock()
+        model.modules.return_value = [fused_moe]
+
+        utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
+        self.assertEqual(fused_moe.w13_weight.data, 1)
+        self.assertEqual(fused_moe.w2_weight.data, 1)
+
+    @mock.patch('torch_npu.get_npu_format')
+    @mock.patch('torch_npu.npu_format_cast')
+    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
+                new=mock.MagicMock)
+    def test_converting_weight_acl_format_format_true(self, mock_npu_cast,
+                                                      mock_get_format):
+        # Weights already report the target format, so no cast may happen.
+        target_format = 29  # ACL_FORMAT_FRACTAL_NZ
+        mock_get_format.return_value = target_format
+        mock_npu_cast.return_value = 1
+
+        moe_layer = mock.MagicMock()
+        moe_layer.w13_weight.data = torch.randn(128, 256)
+        moe_layer.w2_weight.data = torch.randn(256, 128)
+        fake_model = mock.MagicMock()
+        fake_model.modules.return_value = [moe_layer]
+
+        utils.converting_weight_acl_format(fake_model, target_format)
+
+        mock_npu_cast.assert_not_called()
diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py
index dce98d1..268e56e 100644
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -36,10 +36,11 @@ from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
                                         check_torchair_cache_exist,
+                                        converting_weight_acl_format,
                                         register_torchair_model,
                                         write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
-                               is_310p, maybe_converting_weight_acl_format)
+                               is_310p)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 
@@ -136,9 +137,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
                     assert isinstance(kv, tuple), "kv_cache must be a tuple"
                     torch._dynamo.mark_static(kv[0])
                     torch._dynamo.mark_static(kv[1])
-
-            maybe_converting_weight_acl_format(self.model,
-                                               ACL_FORMAT_FRACTAL_NZ)
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
 
             compiled_model = self._get_torchair_lazy_compiled_model(num_tokens)
             model_kwargs = {}
@@ -152,6 +152,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 **model_kwargs,
             )
         else:
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
             hidden_states = super()._generate_dummy_run_hidden_states(
                 with_prefill, is_torchair_compile, input_ids, positions,
                 attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
@@ -261,9 +263,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
             "attn_metadata": attn_metadata
         }
         if not with_prefill:
-            maybe_converting_weight_acl_format(self.model,
-                                               ACL_FORMAT_FRACTAL_NZ)
-
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
             compiled_model = self._get_torchair_lazy_compiled_model(
                 padded_num_tokens_across_dp)
             hidden_states = compiled_model(
@@ -275,8 +276,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
             )
         else:
             assert self.model is not None
-            maybe_converting_weight_acl_format(self.model,
-                                               ACL_FORMAT_FRACTAL_ND)
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
 
             hidden_states = self.model(
                 input_ids=input_ids,
diff --git a/vllm_ascend/torchair/utils.py b/vllm_ascend/torchair/utils.py
index abd5830..9d3254f 100644
--- a/vllm_ascend/torchair/utils.py
+++ b/vllm_ascend/torchair/utils.py
@@ -5,6 +5,7 @@ from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 
 import torch
+import torch_npu
 
 try:
     # Recent release of torchair has moved these ops to `.scope`.
@@ -125,6 +126,25 @@ def npu_wait_tensor(self: torch.Tensor,
     return _npu_wait_tensor(self, dependency) if enabled else self
 
 
+def converting_weight_acl_format(model, format):
+    """Cast FusedMoE w13/w2 weights in `model` to the ACL `format`.
+
+    Some ops reject ACL_FORMAT_FRACTAL_NZ in eager mode yet accept it in
+    torchair graph mode, where NZ is much preferred over ND on 300I Duo.
+    TODO: remove this conversion once npu_quant_grouped_matmul_dequant
+    accepts ACL_FORMAT_FRACTAL_NZ weights in eager mode.
+    """
+    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+
+    for layer in model.modules():
+        if not isinstance(layer, FusedMoE):
+            continue
+        if torch_npu.get_npu_format(layer.w13_weight.data) == format:
+            return  # already in target format; remaining modules are skipped
+        for weight in (layer.w13_weight, layer.w2_weight):
+            weight.data = torch_npu.npu_format_cast(weight.data, format)
+
+
 def register_torchair_model():
     from vllm import ModelRegistry
 
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 0e799e6..b36875e 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -32,7 +32,6 @@ from torch_npu.npu.streams import Event
 from vllm.logger import logger
 
 import vllm_ascend.envs as envs_ascend
-from vllm_ascend.ascend_config import get_ascend_config
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -170,28 +169,6 @@ def aligned_16(tensor: torch.Tensor):
     return new_tensor
 
 
-def maybe_converting_weight_acl_format(model, format=ACL_FORMAT_FRACTAL_NZ):
-    # currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ
-    # in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ
-    # is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this
-    # conversion when using torchair graph mode on 300I Duo platform.
-    # TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
-    # accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
-    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-
-    use_torchair = get_ascend_config().torchair_graph_config.enabled
-    if not is_310p() or not use_torchair:
-        return
-    for module in model.modules():
-        if isinstance(module, FusedMoE):
-            if torch_npu.get_npu_format(module.w13_weight.data) == format:
-                return
-            module.w13_weight.data = torch_npu.npu_format_cast(
-                module.w13_weight.data, format)
-            module.w2_weight.data = torch_npu.npu_format_cast(
-                module.w2_weight.data, format)
-
-
 def try_register_lib(lib_name: str, lib_info: str = ""):
     import importlib
     import importlib.util
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 859f21e..82b4996 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -95,7 +95,6 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata
 from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                ProfileExecuteDuration, is_310p,
-                               maybe_converting_weight_acl_format,
                                vllm_version_is)
 from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
@@ -1265,7 +1264,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                              intermediate_tensors,
                                              inputs_embeds):
         assert self.model is not None
-        maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
         hidden_states = self.model(
             input_ids=input_ids,
             positions=positions,
@@ -1880,7 +1878,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                           is_torchair_compile, input_ids,
                                           positions, attn_metadata, num_tokens,
                                           intermediate_tensors, inputs_embeds):
-        maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
         hidden_states = self.model(input_ids=input_ids,
                                    positions=positions,
                                    intermediate_tensors=intermediate_tensors,