[Refactor] cleanup converting_weight_acl_format (#2482)

Move maybe_converting_weight_acl_format to the torchair module; it is
only used with 310p + torchair.

- vLLM version: v0.10.1.1
- vLLM main:
49ab23b3cc

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-08-25 19:48:55 +08:00
committed by GitHub
parent 0f81e032f0
commit de7649492d
6 changed files with 75 additions and 108 deletions

View File

@@ -36,10 +36,11 @@ from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
check_torchair_cache_exist,
converting_weight_acl_format,
register_torchair_model,
write_kv_cache_bytes_to_file)
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
is_310p, maybe_converting_weight_acl_format)
is_310p)
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
@@ -136,9 +137,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
assert isinstance(kv, tuple), "kv_cache must be a tuple"
torch._dynamo.mark_static(kv[0])
torch._dynamo.mark_static(kv[1])
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_NZ)
if is_310p():
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
compiled_model = self._get_torchair_lazy_compiled_model(num_tokens)
model_kwargs = {}
@@ -152,6 +152,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
**model_kwargs,
)
else:
if is_310p():
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
hidden_states = super()._generate_dummy_run_hidden_states(
with_prefill, is_torchair_compile, input_ids, positions,
attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
@@ -261,9 +263,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
"attn_metadata": attn_metadata
}
if not with_prefill:
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_NZ)
if is_310p():
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
compiled_model = self._get_torchair_lazy_compiled_model(
padded_num_tokens_across_dp)
hidden_states = compiled_model(
@@ -275,8 +276,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
)
else:
assert self.model is not None
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_ND)
if is_310p():
converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
hidden_states = self.model(
input_ids=input_ids,

View File

@@ -5,6 +5,7 @@ from contextlib import contextmanager, nullcontext
from dataclasses import dataclass
import torch
import torch_npu
try:
# Recent release of torchair has moved these ops to `.scope`.
@@ -125,6 +126,25 @@ def npu_wait_tensor(self: torch.Tensor,
return _npu_wait_tensor(self, dependency) if enabled else self
def converting_weight_acl_format(model, format):
    """Cast the expert weights of every FusedMoE module in *model* to *format*.

    Mutates ``w13_weight`` and ``w2_weight`` in place via
    ``torch_npu.npu_format_cast``. Intended to be called only on the 300I Duo
    (310p) platform when switching between eager and torchair graph execution.

    Args:
        model: the loaded model whose submodules are scanned for FusedMoE.
        format: target ACL format constant (e.g. ACL_FORMAT_FRACTAL_NZ or
            ACL_FORMAT_FRACTAL_ND). Note this parameter shadows the builtin
            ``format``.
    """
    # currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ
    # in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ
    # is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this
    # conversion when using torchair graph mode on 300I Duo platform.
    # TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
    # accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
    for module in model.modules():
        if isinstance(module, FusedMoE):
            # NOTE(review): returns as soon as the FIRST FusedMoE module is
            # already in the target format — this assumes all FusedMoE modules
            # always share the same weight format; confirm that invariant holds.
            if torch_npu.get_npu_format(module.w13_weight.data) == format:
                return
            module.w13_weight.data = torch_npu.npu_format_cast(
                module.w13_weight.data, format)
            module.w2_weight.data = torch_npu.npu_format_cast(
                module.w2_weight.data, format)
def register_torchair_model():
from vllm import ModelRegistry

View File

@@ -32,7 +32,6 @@ from torch_npu.npu.streams import Event
from vllm.logger import logger
import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_config import get_ascend_config
if TYPE_CHECKING:
from vllm.config import VllmConfig
@@ -170,28 +169,6 @@ def aligned_16(tensor: torch.Tensor):
return new_tensor
def maybe_converting_weight_acl_format(model, format=ACL_FORMAT_FRACTAL_NZ):
    """Conditionally cast FusedMoE expert weights to *format* in place.

    No-op unless running on the 310p platform AND the torchair graph mode is
    enabled in the ascend config; otherwise casts ``w13_weight`` and
    ``w2_weight`` of every FusedMoE module via ``torch_npu.npu_format_cast``.

    Args:
        model: the loaded model whose submodules are scanned for FusedMoE.
        format: target ACL format constant; defaults to
            ACL_FORMAT_FRACTAL_NZ. Note this parameter shadows the builtin
            ``format``.
    """
    # currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ
    # in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ
    # is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this
    # conversion when using torchair graph mode on 300I Duo platform.
    # TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
    # accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
    use_torchair = get_ascend_config().torchair_graph_config.enabled
    # Only relevant on 300I Duo with torchair graph mode; bail out otherwise.
    if not is_310p() or not use_torchair:
        return
    for module in model.modules():
        if isinstance(module, FusedMoE):
            # NOTE(review): returns as soon as the FIRST FusedMoE module is
            # already in the target format — this assumes all FusedMoE modules
            # always share the same weight format; confirm that invariant holds.
            if torch_npu.get_npu_format(module.w13_weight.data) == format:
                return
            module.w13_weight.data = torch_npu.npu_format_cast(
                module.w13_weight.data, format)
            module.w2_weight.data = torch_npu.npu_format_cast(
                module.w2_weight.data, format)
def try_register_lib(lib_name: str, lib_info: str = ""):
import importlib
import importlib.util

View File

@@ -95,7 +95,6 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata
from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
ProfileExecuteDuration, is_310p,
maybe_converting_weight_acl_format,
vllm_version_is)
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
@@ -1265,7 +1264,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
intermediate_tensors,
inputs_embeds):
assert self.model is not None
maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
hidden_states = self.model(
input_ids=input_ids,
positions=positions,
@@ -1880,7 +1878,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
is_torchair_compile, input_ids,
positions, attn_metadata, num_tokens,
intermediate_tensors, inputs_embeds):
maybe_converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
hidden_states = self.model(input_ids=input_ids,
positions=positions,
intermediate_tensors=intermediate_tensors,