[Refactor] cleanup converting_weight_acl_format_format (#2482)

Move maybe_converting_weight_acl_format to the torchair module, since it is only used with 310P + torchair. The helper is renamed to converting_weight_acl_format, and the is_310p()/torchair-enabled check now lives at the call sites.
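
A minimal sketch of the resulting call-site pattern (the helper name cast_moe_weights_for_graph is illustrative; the imports match the diff below):

    from vllm_ascend.torchair.utils import converting_weight_acl_format
    from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p

    def cast_moe_weights_for_graph(model):
        # The "maybe" guard moves out of the helper: torchair call sites now
        # check the platform themselves before casting FusedMoE weights to
        # the FRACTAL_NZ layout.
        if is_310p():
            converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)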

- vLLM version: v0.10.1.1
- vLLM main: 49ab23b3cc

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Committed: 2025-08-25 19:48:55 +08:00 (committed by GitHub)
Parent: 0f81e032f0
Commit: de7649492d
6 changed files with 75 additions and 108 deletions


@@ -117,79 +117,6 @@ class TestUtils(TestBase):
         output_tensor = utils.aligned_16(input_tensor)
         self.assertEqual(output_tensor.shape[0], 32)
 
-    @mock.patch('torch_npu.get_npu_format')
-    @mock.patch('torch_npu.npu_format_cast')
-    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
-                new=mock.MagicMock)
-    @mock.patch('vllm_ascend.utils.is_310p')
-    @mock.patch('vllm_ascend.utils.get_ascend_config')
-    def test_maybe_converting_weight_acl_format(self, mock_get_config,
-                                                mock_310p, mock_npu_cast,
-                                                mock_get_format):
-        ACL_FORMAT_FRACTAL_NZ = 29
-        mock_310p.return_value = True
-        mock_config = mock.MagicMock()
-        mock_config.torchair_graph_config.enabled = True
-        mock_get_config.return_value = mock_config
-        mock_get_format.return_value = 1
-        mock_npu_cast.return_value = 1
-        fused_moe = mock.MagicMock()
-        fused_moe.w13_weight = mock.MagicMock()
-        fused_moe.w2_weight = mock.MagicMock()
-        fused_moe.w13_weight.data = torch.randn(128, 256)
-        fused_moe.w2_weight.data = torch.randn(256, 128)
-        model = mock.MagicMock()
-        model.modules.return_value = [fused_moe]
-        utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
-        self.assertEqual(fused_moe.w13_weight.data, 1)
-
-    @mock.patch('torch_npu.get_npu_format')
-    @mock.patch('torch_npu.npu_format_cast')
-    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
-                new=mock.MagicMock)
-    @mock.patch('vllm_ascend.utils.is_310p')
-    @mock.patch('vllm_ascend.utils.get_ascend_config')
-    def test_maybe_converting_weight_acl_format_format_true(
-            self, mock_get_config, mock_310p, mock_npu_cast, mock_get_format):
-        ACL_FORMAT_FRACTAL_NZ = 29
-        mock_310p.return_value = True
-        mock_config = mock.MagicMock()
-        mock_config.torchair_graph_config.enabled = True
-        mock_get_config.return_value = mock_config
-        mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ
-        mock_npu_cast.return_value = 1
-        fused_moe = mock.MagicMock()
-        fused_moe.w13_weight = mock.MagicMock()
-        fused_moe.w2_weight = mock.MagicMock()
-        fused_moe.w13_weight.data = torch.randn(128, 256)
-        fused_moe.w2_weight.data = torch.randn(256, 128)
-        model = mock.MagicMock()
-        model.modules.return_value = [fused_moe]
-        mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ
-        utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
-
-    @mock.patch('vllm_ascend.utils.get_ascend_config')
-    @mock.patch('vllm_ascend.utils.is_310p', return_value=False)
-    def test_maybe_converting_weight_acl_format_not_310_not_graph(
-            self, mock_310p, mock_get_config):
-        mock_config = mock.MagicMock()
-        mock_config.torchair_graph_config.enabled = False
-        mock_get_config.return_value = mock_config
-        mock_constant = mock.MagicMock()
-        mock_model = mock.MagicMock()
-        utils.maybe_converting_weight_acl_format(mock_model, mock_constant)
-
     @mock.patch('importlib.util.find_spec')
     @mock.patch('importlib.import_module')
     def test_try_register_lib(self, mock_import_module, mock_find_spec):


@@ -1,7 +1,10 @@
 import os
 from concurrent.futures import ThreadPoolExecutor
+from unittest import mock
 from unittest.mock import MagicMock, patch
+
+import torch
 
 from tests.ut.base import TestBase
 from vllm_ascend.torchair import utils
@@ -75,3 +78,45 @@ class TestTorchairUtils(TestBase):
             args, kwargs = call_args_list[i]
             self.assertEqual(args[0], expected_name)
             self.assertEqual(args[1], expected_path)
+
+    @mock.patch('torch_npu.get_npu_format')
+    @mock.patch('torch_npu.npu_format_cast')
+    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
+                new=mock.MagicMock)
+    def test_converting_weight_acl_format(self, mock_npu_cast,
+                                          mock_get_format):
+        ACL_FORMAT_FRACTAL_NZ = 29
+        mock_get_format.return_value = 1
+        mock_npu_cast.return_value = 1
+        fused_moe = mock.MagicMock()
+        fused_moe.w13_weight = mock.MagicMock()
+        fused_moe.w2_weight = mock.MagicMock()
+        fused_moe.w13_weight.data = torch.randn(128, 256)
+        fused_moe.w2_weight.data = torch.randn(256, 128)
+        model = mock.MagicMock()
+        model.modules.return_value = [fused_moe]
+        utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
+        self.assertEqual(fused_moe.w13_weight.data, 1)
+
+    @mock.patch('torch_npu.get_npu_format')
+    @mock.patch('torch_npu.npu_format_cast')
+    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
+                new=mock.MagicMock)
+    def test_converting_weight_acl_format_format_true(self, mock_npu_cast,
+                                                      mock_get_format):
+        ACL_FORMAT_FRACTAL_NZ = 29
+        mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ
+        mock_npu_cast.return_value = 1
+        fused_moe = mock.MagicMock()
+        fused_moe.w13_weight = mock.MagicMock()
+        fused_moe.w2_weight = mock.MagicMock()
+        fused_moe.w13_weight.data = torch.randn(128, 256)
+        fused_moe.w2_weight.data = torch.randn(256, 128)
+        model = mock.MagicMock()
+        model.modules.return_value = [fused_moe]
+        utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
+        mock_npu_cast.assert_not_called()


@@ -36,10 +36,11 @@ from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
                                         check_torchair_cache_exist,
+                                        converting_weight_acl_format,
                                         register_torchair_model,
                                         write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
-                               is_310p, maybe_converting_weight_acl_format)
+                               is_310p)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
@@ -136,9 +137,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 assert isinstance(kv, tuple), "kv_cache must be a tuple"
                 torch._dynamo.mark_static(kv[0])
                 torch._dynamo.mark_static(kv[1])
-            maybe_converting_weight_acl_format(self.model,
-                                               ACL_FORMAT_FRACTAL_NZ)
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
             compiled_model = self._get_torchair_lazy_compiled_model(num_tokens)
             model_kwargs = {}
@@ -152,6 +152,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 **model_kwargs,
             )
         else:
+            if is_310p():
+                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
             hidden_states = super()._generate_dummy_run_hidden_states(
                 with_prefill, is_torchair_compile, input_ids, positions,
                 attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
@@ -261,9 +263,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
                     "attn_metadata": attn_metadata
                 }
             if not with_prefill:
-                maybe_converting_weight_acl_format(self.model,
-                                                   ACL_FORMAT_FRACTAL_NZ)
+                if is_310p():
+                    converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
                 compiled_model = self._get_torchair_lazy_compiled_model(
                     padded_num_tokens_across_dp)
                 hidden_states = compiled_model(
@@ -275,8 +276,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 )
             else:
                 assert self.model is not None
-                maybe_converting_weight_acl_format(self.model,
-                                                   ACL_FORMAT_FRACTAL_ND)
+                if is_310p():
+                    converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
                 hidden_states = self.model(
                     input_ids=input_ids,
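
On 310P the runner now follows one convention: FusedMoE weights are cast to ACL_FORMAT_FRACTAL_NZ right before a torchair-compiled run and back to ACL_FORMAT_FRACTAL_ND before an eager run. A condensed sketch of that flow, with the method name and the use_compiled flag invented for illustration:

    def _run_model(self, use_compiled: bool, num_tokens: int, **model_kwargs):
        if use_compiled:
            # Graph mode accepts FRACTAL_NZ, which is faster on 300I Duo.
            if is_310p():
                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
            model = self._get_torchair_lazy_compiled_model(num_tokens)
        else:
            # Some eager-mode ops still require FRACTAL_ND, so convert back.
            if is_310p():
                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
            model = self.model
        return model(**model_kwargs)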


@@ -5,6 +5,7 @@ from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 
 import torch
+import torch_npu
 
 try:
     # Recent release of torchair has moved these ops to `.scope`.
@@ -125,6 +126,25 @@ def npu_wait_tensor(self: torch.Tensor,
     return _npu_wait_tensor(self, dependency) if enabled else self
 
 
+def converting_weight_acl_format(model, format):
+    # currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ
+    # in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ
+    # is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this
+    # conversion when using torchair graph mode on 300I Duo platform.
+    # TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
+    # accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
+    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+    for module in model.modules():
+        if isinstance(module, FusedMoE):
+            if torch_npu.get_npu_format(module.w13_weight.data) == format:
+                return
+            module.w13_weight.data = torch_npu.npu_format_cast(
+                module.w13_weight.data, format)
+            module.w2_weight.data = torch_npu.npu_format_cast(
+                module.w2_weight.data, format)
+
+
 def register_torchair_model():
     from vllm import ModelRegistry
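
For reference, a standalone sketch of the torch_npu format-cast pattern the relocated helper wraps (assumes torch_npu and an Ascend NPU are available; the constant value 29 mirrors the test fixtures above):

    import torch
    import torch_npu

    ACL_FORMAT_FRACTAL_NZ = 29  # same value used in the tests above

    weight = torch.randn(128, 256).npu()
    # Cast only when the tensor is not already in the requested layout,
    # mirroring the early-return check in converting_weight_acl_format.
    if torch_npu.get_npu_format(weight) != ACL_FORMAT_FRACTAL_NZ:
        weight = torch_npu.npu_format_cast(weight, ACL_FORMAT_FRACTAL_NZ)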


@@ -32,7 +32,6 @@ from torch_npu.npu.streams import Event
 from vllm.logger import logger
 
 import vllm_ascend.envs as envs_ascend
-from vllm_ascend.ascend_config import get_ascend_config
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -170,28 +169,6 @@ def aligned_16(tensor: torch.Tensor):
     return new_tensor
 
 
-def maybe_converting_weight_acl_format(model, format=ACL_FORMAT_FRACTAL_NZ):
-    # currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ
-    # in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ
-    # is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this
-    # conversion when using torchair graph mode on 300I Duo platform.
-    # TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
-    # accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
-    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-
-    use_torchair = get_ascend_config().torchair_graph_config.enabled
-    if not is_310p() or not use_torchair:
-        return
-    for module in model.modules():
-        if isinstance(module, FusedMoE):
-            if torch_npu.get_npu_format(module.w13_weight.data) == format:
-                return
-            module.w13_weight.data = torch_npu.npu_format_cast(
-                module.w13_weight.data, format)
-            module.w2_weight.data = torch_npu.npu_format_cast(
-                module.w2_weight.data, format)
-
-
 def try_register_lib(lib_name: str, lib_info: str = ""):
     import importlib
     import importlib.util


@@ -95,7 +95,6 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata
 from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                ProfileExecuteDuration, is_310p,
-                               maybe_converting_weight_acl_format,
                                vllm_version_is)
 from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
@@ -1265,7 +1264,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                           intermediate_tensors,
                                           inputs_embeds):
         assert self.model is not None
-        maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
         hidden_states = self.model(
             input_ids=input_ids,
             positions=positions,
@@ -1880,7 +1878,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                            is_torchair_compile, input_ids,
                                            positions, attn_metadata, num_tokens,
                                            intermediate_tensors, inputs_embeds):
-        maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
         hidden_states = self.model(input_ids=input_ids,
                                    positions=positions,
                                    intermediate_tensors=intermediate_tensors,