diff --git a/CMakeLists.txt b/CMakeLists.txt index b64611d..272bdb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,13 @@ set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}") find_package(Torch REQUIRED) +run_python(TORCH_VERSION + "import torch; print(torch.__version__)" "Failed to get torch version") +# Check that the installed torch version is exactly 2.7.1 +if(NOT TORCH_VERSION VERSION_EQUAL "2.7.1") + message(FATAL_ERROR "Expected PyTorch version 2.7.1, but found ${TORCH_VERSION}") +endif() + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") set(SOC_VERSION ${SOC_VERSION}) message(STATUS "Detected SOC version: ${SOC_VERSION}") diff --git a/README.md b/README.md index 9c255b1..4d8aeea 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l - Software: * Python >= 3.9, < 3.12 * CANN >= 8.2.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html)) - * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * PyTorch == 2.7.1, torch-npu == 2.7.1 * vLLM (the same version as vllm-ascend) ## Getting Started diff --git a/README.zh.md b/README.zh.md index bb7ddb9..36d5a87 100644 --- a/README.zh.md +++ b/README.zh.md @@ -44,7 +44,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP - 软件: * Python >= 3.9, < 3.12 * CANN >= 8.2.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html)) - * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * PyTorch == 2.7.1, torch-npu == 2.7.1 * vLLM (与vllm-ascend版本一致) ## 开始使用 diff --git a/docs/source/installation.md b/docs/source/installation.md index 708b283..526206c 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -13,8 +13,8 @@ This document describes how to install vllm-ascend manually. 
|---------------|----------------------------------|-------------------------------------------| | Ascend HDK | Refer to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html) | Required for CANN | | CANN | >= 8.2.RC1 | Required for vllm-ascend and torch-npu | - | torch-npu | >= 2.7.1.dev20250724 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps | - | torch | >= 2.7.1 | Required for torch-npu and vllm | + | torch-npu | == 2.7.1 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps | + | torch | == 2.7.1 | Required for torch-npu and vllm | There are two installation methods: - **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip. diff --git a/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md b/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md index 3e916cc..ea76f0d 100644 --- a/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md +++ b/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md @@ -5,7 +5,7 @@ * Software: * Python >= 3.9, < 3.12 * CANN >= 8.2.rc1 - * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * PyTorch == 2.7.1, torch-npu == 2.7.1 * vLLM (same version as vllm-ascend) * mooncake-transfer-engine reference documentation: https://github.com/kvcache-ai/Mooncake/blob/main/doc/zh/ascend_transport.md diff --git a/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md b/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md index d33c781..8264021 100644 --- a/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md +++ b/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md @@ -5,7 +5,7 @@ * Software: * Python >= 3.9, < 3.12 * CANN >= 8.2.rc1 - * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 
+ * PyTorch == 2.7.1, torch-npu == 2.7.1 * vLLM:main branch * vLLM-Ascend:main branch * Mooncake:[AscendTransport/Mooncake at pooling-async-memcpy](https://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy)(Currently available branch code, continuously updated.) diff --git a/pyproject.toml b/pyproject.toml index 8ac1b7e..e5c9d7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,8 +12,8 @@ requires = [ "scipy", "setuptools>=64", "setuptools-scm>=8", - "torch-npu==2.7.1.dev20250724", - "torch>=2.7.1", + "torch-npu==2.7.1", + "torch==2.7.1", "torchvision", "wheel", "msgpack", diff --git a/requirements.txt b/requirements.txt index d32d475..0947fe3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ pyyaml scipy setuptools>=64 setuptools-scm>=8 -torch>=2.7.1 +torch==2.7.1 torchvision wheel opencv-python-headless<=4.11.0.86 # Required to avoid numpy version conflict with vllm @@ -23,6 +23,6 @@ quart numba # Install torch_npu ---pre ---extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi -torch-npu==2.7.1.dev20250724 +#--pre +#--extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi +torch-npu==2.7.1 diff --git a/tests/ut/ops/test_layernorm.py b/tests/ut/ops/test_layernorm.py index 7a829b6..9da7eb5 100644 --- a/tests/ut/ops/test_layernorm.py +++ b/tests/ut/ops/test_layernorm.py @@ -7,7 +7,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm from tests.ut.base import PytestBase from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod -from vllm_ascend.utils import version_check def mock_rms_norm(x, weight, eps): @@ -18,15 +17,6 @@ def mock_add_rms_norm(x, residual, weight, eps): return 2 * x, None, 2 * residual -def mock_add_rms_norm_quant(x, residual, weight, quant_scale, quant_offset, - epsilon): - x_out = 2 * x - residual_out = 2 * residual - x_out_quant = x_out.to(torch.int8) - residual_out_quant = residual_out.to(torch.int8) - return x_out_quant, None, residual_out_quant - - def 
mock_add_rms_norm_quant_with_bias(x, residual, weight, quant_scale, quant_offset, beta, epsilon): x_out = 2 * x @@ -43,10 +33,8 @@ class TestAscendRMSNorm(PytestBase): mocker.patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm) mocker.patch("torch_npu.npu_add_rms_norm", side_effect=mock_add_rms_norm) - torch_npu_check = version_check() - arnq_side_effect = mock_add_rms_norm_quant_with_bias if torch_npu_check else mock_add_rms_norm_quant mocker.patch("torch_npu.npu_add_rms_norm_quant", - side_effect=arnq_side_effect) + side_effect=mock_add_rms_norm_quant_with_bias) mocker.patch("torch.ops.vllm.maybe_wait_prefetch_done", side_effect=lambda x: None) @@ -82,8 +70,7 @@ class TestAscendRMSNorm(PytestBase): mock_model_instance = mocker.MagicMock() mock_forward_context.model_instance = mock_model_instance - torch_npu_check = version_check() - num_hidden_layers = 3 if torch_npu_check else 2 + num_hidden_layers = 3 mock_model_instance.model.layers = [ mocker.MagicMock() for _ in range(num_hidden_layers) ] @@ -136,37 +123,34 @@ class TestAscendRMSNorm(PytestBase): assert mock_forward_context.fusion_linear == "gate_up_dense" assert mock_forward_context.layer_idx == 1 - if torch_npu_check: - mock_forward_context.fusion_linear = "gate_moe" + mock_forward_context.fusion_linear = "gate_moe" + x_out, residual_out = layer.forward_oot(x, residual) + + assert mock_get_forward_context.call_count == 5 + fusion_linear_expected = "qkv_moe" + assert mock_forward_context.fusion_linear == fusion_linear_expected + assert mock_forward_context.layer_idx == 2 + x_out, residual_out = layer.forward_oot(x, residual) assert mock_get_forward_context.call_count == 6 - fusion_linear_expected = "qkv_moe" if torch_npu_check else "qkv_dense" + fusion_linear_expected = "gate_moe" assert mock_forward_context.fusion_linear == fusion_linear_expected assert mock_forward_context.layer_idx == 2 + # last layer returned directly x_out, residual_out = layer.forward_oot(x, residual) assert 
mock_get_forward_context.call_count == 7 - fusion_linear_expected = "gate_moe" if torch_npu_check else "qkv_dense" - assert mock_forward_context.fusion_linear == fusion_linear_expected - assert mock_forward_context.layer_idx == 2 + assert mock_forward_context.fusion_linear == "qkv_moe" + assert mock_forward_context.layer_idx == 3 - if not torch_npu_check: - return - # last layer returned directly x_out, residual_out = layer.forward_oot(x, residual) assert mock_get_forward_context.call_count == 8 assert mock_forward_context.fusion_linear == "qkv_moe" assert mock_forward_context.layer_idx == 3 - x_out, residual_out = layer.forward_oot(x, residual) - - assert mock_get_forward_context.call_count == 9 - assert mock_forward_context.fusion_linear == "qkv_moe" - assert mock_forward_context.layer_idx == 3 - if __name__ == '__main__': unittest.main() diff --git a/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py b/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py index 09b5aa3..994f5c8 100644 --- a/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py +++ b/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py @@ -23,9 +23,9 @@ class TestAscendW8A8FusedMoEMethod(TestBase): @patch("torch_npu.npu_swiglu") @patch("torch_npu.npu_dynamic_quant") @patch("torch_npu.npu_moe_finalize_routing") - @patch("torch_npu.npu_moe_init_routing") + @patch("torch_npu.npu_moe_init_routing_quant") def test_torchair_fused_experts_with_all2all( - self, mock_moe_init_routing, mock_moe_finalize_routing, + self, mock_npu_moe_init_routing_quant, mock_moe_finalize_routing, mock_dynamic_quant, mock_swiglu, mock_grouped_matmul, mock_moe_re_routing, mock_all_to_all_single): @@ -38,11 +38,10 @@ class TestAscendW8A8FusedMoEMethod(TestBase): placeholder_ones = torch.ones(self.num_tokens, dtype=torch.int32) mock_all_to_all_single.side_effect = lambda output, input, *args, **kwargs: output.copy_( input) - mock_moe_init_routing.return_value = ( - placeholder_int8, - 
placeholder_ones, - placeholder_ones, - ) + mock_npu_moe_init_routing_quant.return_value = ( + placeholder_int8, placeholder_ones, placeholder_ones, + torch.bincount(placeholder_ones, minlength=len(expert_map)), + torch.randn(self.num_tokens)) mock_moe_re_routing.return_value = (placeholder_int8, self.placeholder, torch.randint(0, 100, diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index c929439..a700fbf 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -11,8 +11,7 @@ from vllm.forward_context import (BatchDescriptor, get_forward_context, set_forward_context) import vllm_ascend.envs as envs_ascend -from vllm_ascend.utils import (enable_sp, has_layer_idx, is_moe_model, - version_check) +from vllm_ascend.utils import enable_sp, has_layer_idx, is_moe_model if TYPE_CHECKING: from vllm_ascend.ops.weight_prefetch import WeightPrefetchMethod @@ -162,9 +161,7 @@ def set_ascend_forward_context( # this optim now just support dense models due to the specific operators used. # Once the necessary conditions are met, support for MOE models will also be added. 
from vllm_ascend.quantization.quant_config import AscendQuantConfig - model_type_scope = ["llama", "qwen2", "qwen3"] - if version_check(): - model_type_scope.append("qwen3_moe") + model_type_scope = ["llama", "qwen2", "qwen3", "qwen3_moe"] addrmsnorm_quant_fusion_enabled = isinstance(vllm_config.quant_config, AscendQuantConfig) and \ vllm_config.model_config.hf_config.model_type in model_type_scope and \ forward_context.layer_idx is not None diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 799853f..e03eda6 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -38,7 +38,7 @@ from vllm_ascend.compilation.acl_graph import (get_graph_params, update_graph_params_workspaces) from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, - nd_to_nz_2d, nd_to_nz_spec, version_check) + nd_to_nz_2d, nd_to_nz_spec) from ..utils import weak_ref_tensors @@ -321,7 +321,6 @@ class AscendAttentionBackendImpl(AttentionImpl): self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.key_cache = None self.value_cache = None - self.torch_npu_check = version_check() def _forward_prefill_no_cache( self, @@ -429,22 +428,21 @@ class AscendAttentionBackendImpl(AttentionImpl): forward_context: ForwardContext = get_forward_context() num_tokens = query.shape[0] if forward_context.capturing: - if self.torch_npu_check: - # Get workspace from cache or calculate it if not present. 
- workspace = graph_params.workspaces.get(num_tokens) - if workspace is None: - workspace = torch_npu._npu_paged_attention_get_workspace( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - block_table=attn_metadata.block_tables, - context_lens=attn_metadata.seq_lens, - out=output) - update_graph_params_workspaces( - num_tokens, weak_ref_tensors(workspace)) + # Get workspace from cache or calculate it if not present. + workspace = graph_params.workspaces.get(num_tokens) + if workspace is None: + workspace = torch_npu._npu_paged_attention_get_workspace( + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + block_table=attn_metadata.block_tables, + context_lens=attn_metadata.seq_lens, + out=output) + update_graph_params_workspaces(num_tokens, + weak_ref_tensors(workspace)) # Handle graph capturing mode stream = torch_npu.npu.current_stream() @@ -466,30 +464,17 @@ class AscendAttentionBackendImpl(AttentionImpl): )) torch.npu.graph_task_group_begin(stream) - - if self.torch_npu_check: - torch_npu._npu_paged_attention( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - block_table=attn_metadata.block_tables, - context_lens=attn_metadata.seq_lens, - out=output, - workspace=workspace) - else: - torch_npu._npu_paged_attention( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - block_table=attn_metadata.block_tables, - context_lens=attn_metadata.seq_lens, - out=output) + torch_npu._npu_paged_attention( + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + num_kv_heads=self.num_kv_heads, + 
num_heads=self.num_heads, + scale_value=self.scale, + block_table=attn_metadata.block_tables, + context_lens=attn_metadata.seq_lens, + out=output, + workspace=workspace) handle = torch.npu.graph_task_group_end(stream) graph_params.handles[num_tokens].append(handle) else: diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py index 91d75b5..f2cab32 100644 --- a/vllm_ascend/compilation/acl_graph.py +++ b/vllm_ascend/compilation/acl_graph.py @@ -18,8 +18,6 @@ from vllm.forward_context import BatchDescriptor, get_forward_context from vllm.logger import logger from vllm.platforms import current_platform -from vllm_ascend.utils import version_check - from ..utils import weak_ref_tensors @@ -213,32 +211,20 @@ def update_attn_params(update_stream, forward_context, runtime_shape): output, ) = param seq_lens = forward_context.attn_metadata[key].seq_lens - torch_npu_check = version_check() with torch.npu.stream(update_stream): torch.npu.graph_task_update_begin(update_stream, handle) - if torch_npu_check: - torch_npu._npu_paged_attention( - query=query, - key_cache=key_cache, - value_cache=value_cache, - num_kv_heads=num_kv_heads, - num_heads=num_heads, - scale_value=scale, - block_table=block_table, - context_lens=seq_lens, - out=output, - workspace=graph_params.workspaces.get(runtime_shape)) - else: - torch_npu._npu_paged_attention(query=query, - key_cache=key_cache, - value_cache=value_cache, - num_kv_heads=num_kv_heads, - num_heads=num_heads, - scale_value=scale, - block_table=block_table, - context_lens=seq_lens, - out=output) + torch_npu._npu_paged_attention( + query=query, + key_cache=key_cache, + value_cache=value_cache, + num_kv_heads=num_kv_heads, + num_heads=num_heads, + scale_value=scale, + block_table=block_table, + context_lens=seq_lens, + out=output, + workspace=graph_params.workspaces.get(runtime_shape)) torch.npu.graph_task_update_end(update_stream) event.record(update_stream) diff --git a/vllm_ascend/ops/layernorm.py 
b/vllm_ascend/ops/layernorm.py index 239eeb0..6b89f4a 100644 --- a/vllm_ascend/ops/layernorm.py +++ b/vllm_ascend/ops/layernorm.py @@ -22,8 +22,6 @@ from vllm.config import get_current_vllm_config from vllm.forward_context import get_forward_context from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm -from vllm_ascend.utils import version_check - def _addrmsnorm_forward_oot( self, @@ -36,7 +34,6 @@ def _addrmsnorm_forward_oot( from vllm_ascend.utils import is_310p - torch_npu_check = version_check() if layer is not None and not is_310p(): layer_cls_name = layer.__class__.__name__ try: @@ -53,23 +50,15 @@ def _addrmsnorm_forward_oot( start_flag=x, ) # add_rms_norm_quant - if torch_npu_check: - x, _, residual = torch_npu.npu_add_rms_norm_quant( - x, - residual, - self.weight, - layer.aclnn_input_scale, - layer.aclnn_input_offset, - beta=bias, - epsilon=self.variance_epsilon) - else: - x, _, residual = torch_npu.npu_add_rms_norm_quant( - x, - residual, - self.weight, - layer.aclnn_input_scale, - layer.aclnn_input_offset, - epsilon=self.variance_epsilon) + x, _, residual = torch_npu.npu_add_rms_norm_quant( + x, + residual, + self.weight, + layer.aclnn_input_scale, + layer.aclnn_input_offset, + beta=bias, + epsilon=self.variance_epsilon) + # prefetch qkvo_proj.weight postprocess if weight_prefetch_method: weight_prefetch_method.maybe_prefetch_attn_weight_postprocess( @@ -87,7 +76,7 @@ def _addrmsnorm_forward_oot( else: x, _, residual = torch_npu.npu_add_rms_norm( x, residual, self.weight, self.variance_epsilon) - if torch_npu_check and bias is not None: + if bias is not None: x.add_(bias) torch.ops.vllm.maybe_wait_prefetch_done(x) return x, residual @@ -106,9 +95,8 @@ class AscendRMSNorm(RMSNorm): super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype) vllm_config = get_current_vllm_config() self.bias = None - self.torch_npu_check = version_check() # quantization with anti_method m4 will generate none-zero norm bias - if 
self.torch_npu_check and vllm_config.quant_config is not None and \ + if vllm_config.quant_config is not None and \ any("norm.bias" in name for name in vllm_config.quant_config.quant_description.keys()): self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False) @@ -128,7 +116,7 @@ class AscendRMSNorm(RMSNorm): return x, residual x, residual = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon) - if self.torch_npu_check and self.bias is not None: + if self.bias is not None: x.add_(self.bias) return x diff --git a/vllm_ascend/ops/weight_prefetch.py b/vllm_ascend/ops/weight_prefetch.py index c2548ba..42ff7e0 100644 --- a/vllm_ascend/ops/weight_prefetch.py +++ b/vllm_ascend/ops/weight_prefetch.py @@ -7,7 +7,6 @@ from vllm.forward_context import get_forward_context from vllm_ascend.ascend_config import WeightPrefetchConfig from vllm_ascend.ops.linear import (AscendQKVParallelLinear, AscendRowParallelLinear) -from vllm_ascend.utils import version_check SUPPORTED_MODULES = ["attn", "mlp", "moe"] MOE_PREFETCH_TOKEN_THRESHOLD = 96 @@ -83,8 +82,7 @@ class WeightPrefetchMethod: if not self.moe.is_active_this_forward: return forward_context = get_forward_context() - if not version_check(): - forward_context.layer_idx += 1 + # layer_idx is subtracted by 1 because layer_idx was incremented by 1 at layernorm. 
weight = forward_context.model_instance.model.layers[ forward_context.layer_idx - 1].mlp.experts.w13_weight weight_size = weight.data.element_size() * weight.data.numel( diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index f5bb63d..27020de 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -510,8 +510,7 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None): from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul from vllm_ascend.ops.common_fused_moe import (AscendFusedMoE, AscendSharedFusedMoE) - from vllm_ascend.ops.layernorm import (AscendGemmaRMSNorm, - AscendQuantRMSNorm, AscendRMSNorm) + from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm from vllm_ascend.ops.linear import (AscendColumnParallelLinear, AscendMergedColumnParallelLinear, AscendQKVParallelLinear, @@ -547,12 +546,6 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None): "MultiHeadLatentAttention": AscendMultiHeadLatentAttention, } - if vllm_config is not None and \ - vllm_config.quant_config is not None and \ - any("norm.bias" in name for name in vllm_config.quant_config.quant_description.keys()) and \ - not version_check(): - REGISTERED_ASCEND_OPS["RMSNorm"] = AscendQuantRMSNorm - for name, op_cls in REGISTERED_ASCEND_OPS.items(): CustomOp.register_oot(_decorated_op_cls=op_cls, name=name) @@ -743,21 +736,6 @@ def is_hierarchical_communication_enabled(): and os.getenv("HCCL_INTRA_PCIE_ENABLE", "") == "1") -@functools.cache -def version_check(): - """check if torch_npu version >= dev20250919""" - import re - torch_npu_version = torch_npu.version.__version__ - date_pattern = r'dev(\d{8})' - - match = re.search(date_pattern, torch_npu_version) - if match: - full_date = match.group(1) - if full_date >= "20250919": - return True - return False - - def has_layer_idx(model_instance: torch.nn.Module) -> bool: if model_instance is None: return False