From de7649492ddcbdb7c818665f0b81cc8fbaaaa4b7 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Mon, 25 Aug 2025 19:48:55 +0800 Subject: [PATCH] [Refactor] cleanup converting_weight_acl_format (#2482) move maybe_converting_weight_acl_format to torchair module, it's only used with 310p+torchair - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/49ab23b3ccc2da9274c739d55f9b19206078c7a9 Signed-off-by: wangxiyuan --- tests/ut/test_utils.py | 73 ------------------- tests/ut/torchair/test_utils.py | 45 ++++++++++++ vllm_ascend/torchair/torchair_model_runner.py | 19 ++--- vllm_ascend/torchair/utils.py | 20 +++++ vllm_ascend/utils.py | 23 ------ vllm_ascend/worker/model_runner_v1.py | 3 - 6 files changed, 75 insertions(+), 108 deletions(-) diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py index 46a3ca8..73eca32 100644 --- a/tests/ut/test_utils.py +++ b/tests/ut/test_utils.py @@ -117,79 +117,6 @@ class TestUtils(TestBase): output_tensor = utils.aligned_16(input_tensor) self.assertEqual(output_tensor.shape[0], 32) - @mock.patch('torch_npu.get_npu_format') - @mock.patch('torch_npu.npu_format_cast') - @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE', - new=mock.MagicMock) - @mock.patch('vllm_ascend.utils.is_310p') - @mock.patch('vllm_ascend.utils.get_ascend_config') - def test_maybe_converting_weight_acl_format(self, mock_get_config, - mock_310p, mock_npu_cast, - mock_get_format): - ACL_FORMAT_FRACTAL_NZ = 29 - mock_310p.return_value = True - - mock_config = mock.MagicMock() - mock_config.torchair_graph_config.enabled = True - mock_get_config.return_value = mock_config - mock_get_format.return_value = 1 - - mock_npu_cast.return_value = 1 - - fused_moe = mock.MagicMock() - fused_moe.w13_weight = mock.MagicMock() - fused_moe.w2_weight = mock.MagicMock() - fused_moe.w13_weight.data = torch.randn(128, 256) - fused_moe.w2_weight.data = torch.randn(256, 128) - model = mock.MagicMock() - 
model.modules.return_value = [fused_moe] - - utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ) - self.assertEqual(fused_moe.w13_weight.data, 1) - - @mock.patch('torch_npu.get_npu_format') - @mock.patch('torch_npu.npu_format_cast') - @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE', - new=mock.MagicMock) - @mock.patch('vllm_ascend.utils.is_310p') - @mock.patch('vllm_ascend.utils.get_ascend_config') - def test_maybe_converting_weight_acl_format_format_true( - self, mock_get_config, mock_310p, mock_npu_cast, mock_get_format): - ACL_FORMAT_FRACTAL_NZ = 29 - mock_310p.return_value = True - - mock_config = mock.MagicMock() - mock_config.torchair_graph_config.enabled = True - mock_get_config.return_value = mock_config - mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ - - mock_npu_cast.return_value = 1 - - fused_moe = mock.MagicMock() - fused_moe.w13_weight = mock.MagicMock() - fused_moe.w2_weight = mock.MagicMock() - fused_moe.w13_weight.data = torch.randn(128, 256) - fused_moe.w2_weight.data = torch.randn(256, 128) - model = mock.MagicMock() - model.modules.return_value = [fused_moe] - - mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ - - utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ) - - @mock.patch('vllm_ascend.utils.get_ascend_config') - @mock.patch('vllm_ascend.utils.is_310p', return_value=False) - def test_maybe_converting_weight_acl_format_not_310_not_graph( - self, mock_310p, mock_get_config): - mock_config = mock.MagicMock() - mock_config.torchair_graph_config.enabled = False - mock_get_config.return_value = mock_config - - mock_constant = mock.MagicMock() - - mock_model = mock.MagicMock() - utils.maybe_converting_weight_acl_format(mock_model, mock_constant) - @mock.patch('importlib.util.find_spec') @mock.patch('importlib.import_module') def test_try_register_lib(self, mock_import_module, mock_find_spec): diff --git a/tests/ut/torchair/test_utils.py b/tests/ut/torchair/test_utils.py index 
1d65fd1..45416bd 100644 --- a/tests/ut/torchair/test_utils.py +++ b/tests/ut/torchair/test_utils.py @@ -1,7 +1,10 @@ import os from concurrent.futures import ThreadPoolExecutor +from unittest import mock from unittest.mock import MagicMock, patch +import torch + from tests.ut.base import TestBase from vllm_ascend.torchair import utils @@ -75,3 +78,45 @@ class TestTorchairUtils(TestBase): args, kwargs = call_args_list[i] self.assertEqual(args[0], expected_name) self.assertEqual(args[1], expected_path) + + @mock.patch('torch_npu.get_npu_format') + @mock.patch('torch_npu.npu_format_cast') + @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE', + new=mock.MagicMock) + def test_converting_weight_acl_format(self, mock_npu_cast, + mock_get_format): + ACL_FORMAT_FRACTAL_NZ = 29 + mock_get_format.return_value = 1 + mock_npu_cast.return_value = 1 + + fused_moe = mock.MagicMock() + fused_moe.w13_weight = mock.MagicMock() + fused_moe.w2_weight = mock.MagicMock() + fused_moe.w13_weight.data = torch.randn(128, 256) + fused_moe.w2_weight.data = torch.randn(256, 128) + model = mock.MagicMock() + model.modules.return_value = [fused_moe] + + utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ) + self.assertEqual(fused_moe.w13_weight.data, 1) + + @mock.patch('torch_npu.get_npu_format') + @mock.patch('torch_npu.npu_format_cast') + @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE', + new=mock.MagicMock) + def test_converting_weight_acl_format_format_true(self, mock_npu_cast, + mock_get_format): + ACL_FORMAT_FRACTAL_NZ = 29 + mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ + mock_npu_cast.return_value = 1 + + fused_moe = mock.MagicMock() + fused_moe.w13_weight = mock.MagicMock() + fused_moe.w2_weight = mock.MagicMock() + fused_moe.w13_weight.data = torch.randn(128, 256) + fused_moe.w2_weight.data = torch.randn(256, 128) + model = mock.MagicMock() + model.modules.return_value = [fused_moe] + + utils.converting_weight_acl_format(model, 
ACL_FORMAT_FRACTAL_NZ) + mock_npu_cast.assert_not_called() diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py index dce98d1..268e56e 100644 --- a/vllm_ascend/torchair/torchair_model_runner.py +++ b/vllm_ascend/torchair/torchair_model_runner.py @@ -36,10 +36,11 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.platform import NPUPlatform from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata, check_torchair_cache_exist, + converting_weight_acl_format, register_torchair_model, write_kv_cache_bytes_to_file) from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, - is_310p, maybe_converting_weight_acl_format) + is_310p) from vllm_ascend.worker.model_runner_v1 import NPUModelRunner @@ -136,9 +137,8 @@ class NPUTorchairModelRunner(NPUModelRunner): assert isinstance(kv, tuple), "kv_cache must be a tuple" torch._dynamo.mark_static(kv[0]) torch._dynamo.mark_static(kv[1]) - - maybe_converting_weight_acl_format(self.model, - ACL_FORMAT_FRACTAL_NZ) + if is_310p(): + converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ) compiled_model = self._get_torchair_lazy_compiled_model(num_tokens) model_kwargs = {} @@ -152,6 +152,8 @@ class NPUTorchairModelRunner(NPUModelRunner): **model_kwargs, ) else: + if is_310p(): + converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND) hidden_states = super()._generate_dummy_run_hidden_states( with_prefill, is_torchair_compile, input_ids, positions, attn_metadata, num_tokens, intermediate_tensors, inputs_embeds) @@ -261,9 +263,8 @@ class NPUTorchairModelRunner(NPUModelRunner): "attn_metadata": attn_metadata } if not with_prefill: - maybe_converting_weight_acl_format(self.model, - ACL_FORMAT_FRACTAL_NZ) - + if is_310p(): + converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ) compiled_model = self._get_torchair_lazy_compiled_model( padded_num_tokens_across_dp) hidden_states = compiled_model( @@ -275,8 
+276,8 @@ class NPUTorchairModelRunner(NPUModelRunner): ) else: assert self.model is not None - maybe_converting_weight_acl_format(self.model, - ACL_FORMAT_FRACTAL_ND) + if is_310p(): + converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND) hidden_states = self.model( input_ids=input_ids, diff --git a/vllm_ascend/torchair/utils.py b/vllm_ascend/torchair/utils.py index abd5830..9d3254f 100644 --- a/vllm_ascend/torchair/utils.py +++ b/vllm_ascend/torchair/utils.py @@ -5,6 +5,7 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass import torch +import torch_npu try: # Recent release of torchair has moved these ops to `.scope`. @@ -125,6 +126,25 @@ def npu_wait_tensor(self: torch.Tensor, return _npu_wait_tensor(self, dependency) if enabled else self +def converting_weight_acl_format(model, format): + # currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ + # in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ + # is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this + # conversion when using torchair graph mode on 300I Duo platform. + # TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant + # accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode. 
+ from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + for module in model.modules(): + if isinstance(module, FusedMoE): + if torch_npu.get_npu_format(module.w13_weight.data) == format: + return + module.w13_weight.data = torch_npu.npu_format_cast( + module.w13_weight.data, format) + module.w2_weight.data = torch_npu.npu_format_cast( + module.w2_weight.data, format) + + def register_torchair_model(): from vllm import ModelRegistry diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 0e799e6..b36875e 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -32,7 +32,6 @@ from torch_npu.npu.streams import Event from vllm.logger import logger import vllm_ascend.envs as envs_ascend -from vllm_ascend.ascend_config import get_ascend_config if TYPE_CHECKING: from vllm.config import VllmConfig @@ -170,28 +169,6 @@ def aligned_16(tensor: torch.Tensor): return new_tensor -def maybe_converting_weight_acl_format(model, format=ACL_FORMAT_FRACTAL_NZ): - # currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ - # in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ - # is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this - # conversion when using torchair graph mode on 300I Duo platform. - # TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant - # accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode. 
- from vllm.model_executor.layers.fused_moe.layer import FusedMoE - - use_torchair = get_ascend_config().torchair_graph_config.enabled - if not is_310p() or not use_torchair: - return - for module in model.modules(): - if isinstance(module, FusedMoE): - if torch_npu.get_npu_format(module.w13_weight.data) == format: - return - module.w13_weight.data = torch_npu.npu_format_cast( - module.w13_weight.data, format) - module.w2_weight.data = torch_npu.npu_format_cast( - module.w2_weight.data, format) - - def try_register_lib(lib_name: str, lib_info: str = ""): import importlib import importlib.util diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 859f21e..82b4996 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -95,7 +95,6 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, ProfileExecuteDuration, is_310p, - maybe_converting_weight_acl_format, vllm_version_is) from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer @@ -1265,7 +1264,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): intermediate_tensors, inputs_embeds): assert self.model is not None - maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND) hidden_states = self.model( input_ids=input_ids, positions=positions, @@ -1880,7 +1878,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): is_torchair_compile, input_ids, positions, attn_metadata, num_tokens, intermediate_tensors, inputs_embeds): - maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND) hidden_states = self.model(input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors,