[Refactor] Clean up maybe_converting_weight_acl_format (#2482)
Move maybe_converting_weight_acl_format into the torchair module (renamed to
converting_weight_acl_format), since it is only used with 310P + torchair.
- vLLM version: v0.10.1.1
- vLLM main: 49ab23b3cc
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -117,79 +117,6 @@ class TestUtils(TestBase):
|
|||||||
output_tensor = utils.aligned_16(input_tensor)
|
output_tensor = utils.aligned_16(input_tensor)
|
||||||
self.assertEqual(output_tensor.shape[0], 32)
|
self.assertEqual(output_tensor.shape[0], 32)
|
||||||
|
|
||||||
@mock.patch('torch_npu.get_npu_format')
|
|
||||||
@mock.patch('torch_npu.npu_format_cast')
|
|
||||||
@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
|
|
||||||
new=mock.MagicMock)
|
|
||||||
@mock.patch('vllm_ascend.utils.is_310p')
|
|
||||||
@mock.patch('vllm_ascend.utils.get_ascend_config')
|
|
||||||
def test_maybe_converting_weight_acl_format(self, mock_get_config,
|
|
||||||
mock_310p, mock_npu_cast,
|
|
||||||
mock_get_format):
|
|
||||||
ACL_FORMAT_FRACTAL_NZ = 29
|
|
||||||
mock_310p.return_value = True
|
|
||||||
|
|
||||||
mock_config = mock.MagicMock()
|
|
||||||
mock_config.torchair_graph_config.enabled = True
|
|
||||||
mock_get_config.return_value = mock_config
|
|
||||||
mock_get_format.return_value = 1
|
|
||||||
|
|
||||||
mock_npu_cast.return_value = 1
|
|
||||||
|
|
||||||
fused_moe = mock.MagicMock()
|
|
||||||
fused_moe.w13_weight = mock.MagicMock()
|
|
||||||
fused_moe.w2_weight = mock.MagicMock()
|
|
||||||
fused_moe.w13_weight.data = torch.randn(128, 256)
|
|
||||||
fused_moe.w2_weight.data = torch.randn(256, 128)
|
|
||||||
model = mock.MagicMock()
|
|
||||||
model.modules.return_value = [fused_moe]
|
|
||||||
|
|
||||||
utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
|
|
||||||
self.assertEqual(fused_moe.w13_weight.data, 1)
|
|
||||||
|
|
||||||
@mock.patch('torch_npu.get_npu_format')
|
|
||||||
@mock.patch('torch_npu.npu_format_cast')
|
|
||||||
@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
|
|
||||||
new=mock.MagicMock)
|
|
||||||
@mock.patch('vllm_ascend.utils.is_310p')
|
|
||||||
@mock.patch('vllm_ascend.utils.get_ascend_config')
|
|
||||||
def test_maybe_converting_weight_acl_format_format_true(
|
|
||||||
self, mock_get_config, mock_310p, mock_npu_cast, mock_get_format):
|
|
||||||
ACL_FORMAT_FRACTAL_NZ = 29
|
|
||||||
mock_310p.return_value = True
|
|
||||||
|
|
||||||
mock_config = mock.MagicMock()
|
|
||||||
mock_config.torchair_graph_config.enabled = True
|
|
||||||
mock_get_config.return_value = mock_config
|
|
||||||
mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ
|
|
||||||
|
|
||||||
mock_npu_cast.return_value = 1
|
|
||||||
|
|
||||||
fused_moe = mock.MagicMock()
|
|
||||||
fused_moe.w13_weight = mock.MagicMock()
|
|
||||||
fused_moe.w2_weight = mock.MagicMock()
|
|
||||||
fused_moe.w13_weight.data = torch.randn(128, 256)
|
|
||||||
fused_moe.w2_weight.data = torch.randn(256, 128)
|
|
||||||
model = mock.MagicMock()
|
|
||||||
model.modules.return_value = [fused_moe]
|
|
||||||
|
|
||||||
mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ
|
|
||||||
|
|
||||||
utils.maybe_converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
|
|
||||||
|
|
||||||
@mock.patch('vllm_ascend.utils.get_ascend_config')
|
|
||||||
@mock.patch('vllm_ascend.utils.is_310p', return_value=False)
|
|
||||||
def test_maybe_converting_weight_acl_format_not_310_not_graph(
|
|
||||||
self, mock_310p, mock_get_config):
|
|
||||||
mock_config = mock.MagicMock()
|
|
||||||
mock_config.torchair_graph_config.enabled = False
|
|
||||||
mock_get_config.return_value = mock_config
|
|
||||||
|
|
||||||
mock_constant = mock.MagicMock()
|
|
||||||
|
|
||||||
mock_model = mock.MagicMock()
|
|
||||||
utils.maybe_converting_weight_acl_format(mock_model, mock_constant)
|
|
||||||
|
|
||||||
@mock.patch('importlib.util.find_spec')
|
@mock.patch('importlib.util.find_spec')
|
||||||
@mock.patch('importlib.import_module')
|
@mock.patch('importlib.import_module')
|
||||||
def test_try_register_lib(self, mock_import_module, mock_find_spec):
|
def test_try_register_lib(self, mock_import_module, mock_find_spec):
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
import os
|
import os
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from unittest import mock
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
from tests.ut.base import TestBase
|
from tests.ut.base import TestBase
|
||||||
from vllm_ascend.torchair import utils
|
from vllm_ascend.torchair import utils
|
||||||
|
|
||||||
@@ -75,3 +78,45 @@ class TestTorchairUtils(TestBase):
|
|||||||
args, kwargs = call_args_list[i]
|
args, kwargs = call_args_list[i]
|
||||||
self.assertEqual(args[0], expected_name)
|
self.assertEqual(args[0], expected_name)
|
||||||
self.assertEqual(args[1], expected_path)
|
self.assertEqual(args[1], expected_path)
|
||||||
|
|
||||||
|
@mock.patch('torch_npu.get_npu_format')
|
||||||
|
@mock.patch('torch_npu.npu_format_cast')
|
||||||
|
@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
|
||||||
|
new=mock.MagicMock)
|
||||||
|
def test_converting_weight_acl_format(self, mock_npu_cast,
|
||||||
|
mock_get_format):
|
||||||
|
ACL_FORMAT_FRACTAL_NZ = 29
|
||||||
|
mock_get_format.return_value = 1
|
||||||
|
mock_npu_cast.return_value = 1
|
||||||
|
|
||||||
|
fused_moe = mock.MagicMock()
|
||||||
|
fused_moe.w13_weight = mock.MagicMock()
|
||||||
|
fused_moe.w2_weight = mock.MagicMock()
|
||||||
|
fused_moe.w13_weight.data = torch.randn(128, 256)
|
||||||
|
fused_moe.w2_weight.data = torch.randn(256, 128)
|
||||||
|
model = mock.MagicMock()
|
||||||
|
model.modules.return_value = [fused_moe]
|
||||||
|
|
||||||
|
utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
|
||||||
|
self.assertEqual(fused_moe.w13_weight.data, 1)
|
||||||
|
|
||||||
|
@mock.patch('torch_npu.get_npu_format')
|
||||||
|
@mock.patch('torch_npu.npu_format_cast')
|
||||||
|
@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
|
||||||
|
new=mock.MagicMock)
|
||||||
|
def test_converting_weight_acl_format_format_true(self, mock_npu_cast,
|
||||||
|
mock_get_format):
|
||||||
|
ACL_FORMAT_FRACTAL_NZ = 29
|
||||||
|
mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ
|
||||||
|
mock_npu_cast.return_value = 1
|
||||||
|
|
||||||
|
fused_moe = mock.MagicMock()
|
||||||
|
fused_moe.w13_weight = mock.MagicMock()
|
||||||
|
fused_moe.w2_weight = mock.MagicMock()
|
||||||
|
fused_moe.w13_weight.data = torch.randn(128, 256)
|
||||||
|
fused_moe.w2_weight.data = torch.randn(256, 128)
|
||||||
|
model = mock.MagicMock()
|
||||||
|
model.modules.return_value = [fused_moe]
|
||||||
|
|
||||||
|
utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)
|
||||||
|
mock_npu_cast.assert_not_called()
|
||||||
|
|||||||
@@ -36,10 +36,11 @@ from vllm_ascend.ascend_config import get_ascend_config
|
|||||||
from vllm_ascend.platform import NPUPlatform
|
from vllm_ascend.platform import NPUPlatform
|
||||||
from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
|
from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
|
||||||
check_torchair_cache_exist,
|
check_torchair_cache_exist,
|
||||||
|
converting_weight_acl_format,
|
||||||
register_torchair_model,
|
register_torchair_model,
|
||||||
write_kv_cache_bytes_to_file)
|
write_kv_cache_bytes_to_file)
|
||||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
||||||
is_310p, maybe_converting_weight_acl_format)
|
is_310p)
|
||||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||||
|
|
||||||
|
|
||||||
@@ -136,9 +137,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
|||||||
assert isinstance(kv, tuple), "kv_cache must be a tuple"
|
assert isinstance(kv, tuple), "kv_cache must be a tuple"
|
||||||
torch._dynamo.mark_static(kv[0])
|
torch._dynamo.mark_static(kv[0])
|
||||||
torch._dynamo.mark_static(kv[1])
|
torch._dynamo.mark_static(kv[1])
|
||||||
|
if is_310p():
|
||||||
maybe_converting_weight_acl_format(self.model,
|
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
|
||||||
ACL_FORMAT_FRACTAL_NZ)
|
|
||||||
|
|
||||||
compiled_model = self._get_torchair_lazy_compiled_model(num_tokens)
|
compiled_model = self._get_torchair_lazy_compiled_model(num_tokens)
|
||||||
model_kwargs = {}
|
model_kwargs = {}
|
||||||
@@ -152,6 +152,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
|||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
if is_310p():
|
||||||
|
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
|
||||||
hidden_states = super()._generate_dummy_run_hidden_states(
|
hidden_states = super()._generate_dummy_run_hidden_states(
|
||||||
with_prefill, is_torchair_compile, input_ids, positions,
|
with_prefill, is_torchair_compile, input_ids, positions,
|
||||||
attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
|
attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
|
||||||
@@ -261,9 +263,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
|||||||
"attn_metadata": attn_metadata
|
"attn_metadata": attn_metadata
|
||||||
}
|
}
|
||||||
if not with_prefill:
|
if not with_prefill:
|
||||||
maybe_converting_weight_acl_format(self.model,
|
if is_310p():
|
||||||
ACL_FORMAT_FRACTAL_NZ)
|
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
|
||||||
|
|
||||||
compiled_model = self._get_torchair_lazy_compiled_model(
|
compiled_model = self._get_torchair_lazy_compiled_model(
|
||||||
padded_num_tokens_across_dp)
|
padded_num_tokens_across_dp)
|
||||||
hidden_states = compiled_model(
|
hidden_states = compiled_model(
|
||||||
@@ -275,8 +276,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
assert self.model is not None
|
assert self.model is not None
|
||||||
maybe_converting_weight_acl_format(self.model,
|
if is_310p():
|
||||||
ACL_FORMAT_FRACTAL_ND)
|
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
|
||||||
|
|
||||||
hidden_states = self.model(
|
hidden_states = self.model(
|
||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from contextlib import contextmanager, nullcontext
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
import torch_npu
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Recent release of torchair has moved these ops to `.scope`.
|
# Recent release of torchair has moved these ops to `.scope`.
|
||||||
@@ -125,6 +126,25 @@ def npu_wait_tensor(self: torch.Tensor,
|
|||||||
return _npu_wait_tensor(self, dependency) if enabled else self
|
return _npu_wait_tensor(self, dependency) if enabled else self
|
||||||
|
|
||||||
|
|
||||||
|
def converting_weight_acl_format(model, format):
|
||||||
|
# currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ
|
||||||
|
# in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ
|
||||||
|
# is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this
|
||||||
|
# conversion when using torchair graph mode on 300I Duo platform.
|
||||||
|
# TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
|
||||||
|
# accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
|
||||||
|
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
||||||
|
|
||||||
|
for module in model.modules():
|
||||||
|
if isinstance(module, FusedMoE):
|
||||||
|
if torch_npu.get_npu_format(module.w13_weight.data) == format:
|
||||||
|
return
|
||||||
|
module.w13_weight.data = torch_npu.npu_format_cast(
|
||||||
|
module.w13_weight.data, format)
|
||||||
|
module.w2_weight.data = torch_npu.npu_format_cast(
|
||||||
|
module.w2_weight.data, format)
|
||||||
|
|
||||||
|
|
||||||
def register_torchair_model():
|
def register_torchair_model():
|
||||||
from vllm import ModelRegistry
|
from vllm import ModelRegistry
|
||||||
|
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ from torch_npu.npu.streams import Event
|
|||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
|
||||||
import vllm_ascend.envs as envs_ascend
|
import vllm_ascend.envs as envs_ascend
|
||||||
from vllm_ascend.ascend_config import get_ascend_config
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
@@ -170,28 +169,6 @@ def aligned_16(tensor: torch.Tensor):
|
|||||||
return new_tensor
|
return new_tensor
|
||||||
|
|
||||||
|
|
||||||
def maybe_converting_weight_acl_format(model, format=ACL_FORMAT_FRACTAL_NZ):
|
|
||||||
# currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ
|
|
||||||
# in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ
|
|
||||||
# is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this
|
|
||||||
# conversion when using torchair graph mode on 300I Duo platform.
|
|
||||||
# TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
|
|
||||||
# accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
|
|
||||||
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
|
||||||
|
|
||||||
use_torchair = get_ascend_config().torchair_graph_config.enabled
|
|
||||||
if not is_310p() or not use_torchair:
|
|
||||||
return
|
|
||||||
for module in model.modules():
|
|
||||||
if isinstance(module, FusedMoE):
|
|
||||||
if torch_npu.get_npu_format(module.w13_weight.data) == format:
|
|
||||||
return
|
|
||||||
module.w13_weight.data = torch_npu.npu_format_cast(
|
|
||||||
module.w13_weight.data, format)
|
|
||||||
module.w2_weight.data = torch_npu.npu_format_cast(
|
|
||||||
module.w2_weight.data, format)
|
|
||||||
|
|
||||||
|
|
||||||
def try_register_lib(lib_name: str, lib_info: str = ""):
|
def try_register_lib(lib_name: str, lib_info: str = ""):
|
||||||
import importlib
|
import importlib
|
||||||
import importlib.util
|
import importlib.util
|
||||||
|
|||||||
@@ -95,7 +95,6 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata
|
|||||||
from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
|
from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
|
||||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
||||||
ProfileExecuteDuration, is_310p,
|
ProfileExecuteDuration, is_310p,
|
||||||
maybe_converting_weight_acl_format,
|
|
||||||
vllm_version_is)
|
vllm_version_is)
|
||||||
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
|
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
|
||||||
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
|
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
|
||||||
@@ -1265,7 +1264,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
intermediate_tensors,
|
intermediate_tensors,
|
||||||
inputs_embeds):
|
inputs_embeds):
|
||||||
assert self.model is not None
|
assert self.model is not None
|
||||||
maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
|
|
||||||
hidden_states = self.model(
|
hidden_states = self.model(
|
||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
positions=positions,
|
positions=positions,
|
||||||
@@ -1880,7 +1878,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
is_torchair_compile, input_ids,
|
is_torchair_compile, input_ids,
|
||||||
positions, attn_metadata, num_tokens,
|
positions, attn_metadata, num_tokens,
|
||||||
intermediate_tensors, inputs_embeds):
|
intermediate_tensors, inputs_embeds):
|
||||||
maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
|
|
||||||
hidden_states = self.model(input_ids=input_ids,
|
hidden_states = self.model(input_ids=input_ids,
|
||||||
positions=positions,
|
positions=positions,
|
||||||
intermediate_tensors=intermediate_tensors,
|
intermediate_tensors=intermediate_tensors,
|
||||||
|
|||||||
Reference in New Issue
Block a user