diff --git a/.github/workflows/schedule_image_build_and_push.yaml b/.github/workflows/schedule_image_build_and_push.yaml index dce1484e..c4bb64ee 100644 --- a/.github/workflows/schedule_image_build_and_push.yaml +++ b/.github/workflows/schedule_image_build_and_push.yaml @@ -43,10 +43,10 @@ jobs: - name: A3 openEuler dockerfile: Dockerfile.a3.openEuler suffix: 'a3-openeuler' - # - name: 310P Ubuntu - # dockerfile: Dockerfile.310p - # - name: 310P openEuler - # dockerfile: Dockerfile.310p.openEuler + - name: 310P Ubuntu + dockerfile: Dockerfile.310p + - name: 310P openEuler + dockerfile: Dockerfile.310p.openEuler uses: ./.github/workflows/_schedule_image_build.yaml with: dockerfile: ${{ matrix.build_meta.dockerfile }} diff --git a/tests/ut/ops/test_activation.py b/tests/ut/ops/test_activation.py index bf03aa5c..63ddb018 100644 --- a/tests/ut/ops/test_activation.py +++ b/tests/ut/ops/test_activation.py @@ -21,6 +21,7 @@ from vllm.config import set_current_vllm_config from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul from vllm_ascend.utils import AscendDeviceType +from vllm_ascend.utils import is_310p as is_310p_hw @pytest.fixture @@ -51,18 +52,26 @@ def test_QuickGELU_forward(mock_gelu, dummy_tensor, default_vllm_config): mock_gelu.assert_called_once() +@pytest.mark.skipif(is_310p_hw(), reason="310P operator classes have already been refactored.") @pytest.mark.parametrize("is_310p", [True, False]) @patch("torch_npu.npu_swiglu", side_effect=lambda x: x + 1) @patch("torch.ops.vllm.maybe_wait_prefetch_done", side_effect=lambda x: None) -@patch("torch.ops.vllm.maybe_prefetch_mlp_down_proj", - side_effect=lambda x: None) -def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj, - mock_maybe_wait_prefetch_done, mock_swiglu, - is_310p, dummy_tensor, default_vllm_config): +@patch("torch.ops.vllm.maybe_prefetch_mlp_down_proj", side_effect=lambda x: None) +def test_SiluAndMul_forward( + mock_maybe_prefetch_mlp_down_proj, + mock_maybe_wait_prefetch_done, + mock_swiglu, + is_310p, + dummy_tensor, + default_vllm_config, +): + if is_310p and (not is_310p_hw()): + pytest.skip("Pseudo-310P param case is not valid on non-310P CI after refactor.") - with patch("vllm_ascend.utils.get_ascend_device_type", - return_value=AscendDeviceType._310P - if is_310p else AscendDeviceType.A3): + with patch( + "vllm_ascend.utils.get_ascend_device_type", + return_value=AscendDeviceType._310P if is_310p else AscendDeviceType.A3, + ): layer = SiluAndMul() out = layer.forward(dummy_tensor) @@ -81,9 +90,7 @@ def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj, mock_maybe_wait_prefetch_done.assert_called_once() actual_arg = mock_swiglu.call_args[0][0] - assert torch.allclose( - actual_arg, - expected_arg), "npu_swiglu called with unexpected input" + assert torch.allclose(actual_arg, expected_arg), "npu_swiglu called with unexpected input" expected_out = dummy_tensor + 1 assert torch.allclose(out, expected_out) diff --git a/tests/ut/ops/test_layernorm.py b/tests/ut/ops/test_layernorm.py index 3f9ccdc9..6805ef06 100644 --- a/tests/ut/ops/test_layernorm.py +++ b/tests/ut/ops/test_layernorm.py @@ -5,8 +5,9 @@ import torch from vllm.config import set_current_vllm_config from vllm.model_executor.layers.layernorm import RMSNorm -from vllm_ascend.utils import AscendDeviceType -from vllm_ascend.utils import enable_custom_op +from vllm_ascend.utils import AscendDeviceType, enable_custom_op +from vllm_ascend.utils import is_310p as is_310p_hw + enable_custom_op() @@ -22,12 +23,12 @@ def mock_rms_norm(x, 
weight, eps): def mock_add_rms_norm(x, residual, weight, eps): return 2 * x, None, 2 * residual + def mock_add_rms_norm_bias(x, residual, weight, bias, eps): if bias is None: return 2 * x, None, 2 * residual else: return 2 * x + bias, None, 2 * residual - @pytest.fixture(autouse=True) @@ -39,18 +40,22 @@ def default_vllm_config(): yield mock_config +@pytest.mark.skipif(is_310p_hw(), reason="310P operator classes have already been refactored.") @pytest.mark.parametrize("is_310p", [True, False]) -@pytest.mark.parametrize("residual", - [None, torch.randn(4, 8, dtype=torch.float32)]) +@pytest.mark.parametrize("residual", [None, torch.randn(4, 8, dtype=torch.float32)]) @patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm) @patch("torch_npu.npu_add_rms_norm", side_effect=mock_add_rms_norm) @patch("torch.ops._C_ascend.npu_add_rms_norm_bias", side_effect=mock_add_rms_norm_bias) -def test_RMSNorm_forward(mock_add_rms_norm_bias, mock_add_rmsnorm, mock_rmsnorm, is_310p, residual, - dummy_tensor, default_vllm_config): +def test_RMSNorm_forward( + mock_add_rms_norm_bias, mock_add_rmsnorm, mock_rmsnorm, is_310p, residual, dummy_tensor, default_vllm_config +): + if is_310p and (not is_310p_hw()): + pytest.skip("Pseudo-310P branch is invalid on non-310P CI after refactor.") - with patch("vllm_ascend.utils.get_ascend_device_type", - return_value=AscendDeviceType._310P - if is_310p else AscendDeviceType.A3): + with patch( + "vllm_ascend.utils.get_ascend_device_type", + return_value=AscendDeviceType._310P if is_310p else AscendDeviceType.A3, + ): layer = RMSNorm(hidden_size=8, eps=1e-05) if residual is not None: out_x, out_residual = layer.forward_oot(dummy_tensor, residual) diff --git a/vllm_ascend/_310p/attention/attention_v1.py b/vllm_ascend/_310p/attention/attention_v1.py index 9e3ccf2f..feaef523 100644 --- a/vllm_ascend/_310p/attention/attention_v1.py +++ b/vllm_ascend/_310p/attention/attention_v1.py @@ -25,7 +25,7 @@ from vllm_ascend._310p.attention.metadata_builder import AscendAttentionMetadata from vllm_ascend.attention.attention_v1 import AscendAttentionBackend as _BaseBackend from vllm_ascend.attention.attention_v1 import AscendAttentionBackendImpl as _BaseImpl from vllm_ascend.attention.attention_v1 import AscendAttentionMetadataBuilder, AscendAttentionState, AscendMetadata -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, aligned_16, nd_to_nz_2d +from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, nd_to_nz_2d class AscendAttentionBackend310(_BaseBackend): @@ -64,8 +64,6 @@ class AscendAttentionBackendImpl310(_BaseImpl): def _forward_prefill_310p_fallback(self, query, key, value, attn_metadata, output): real_tokens = int(attn_metadata.seq_lens.sum().item()) - query, key, value, output = (aligned_16(t) for t in (query, key, value, output)) - seq_len = attn_metadata.seq_lens if seq_len.dtype != torch.int32: seq_len = seq_len.to(torch.int32) diff --git a/vllm_ascend/_310p/model_runner_310p.py b/vllm_ascend/_310p/model_runner_310p.py new file mode 100644 index 00000000..7b115fb7 --- /dev/null +++ b/vllm_ascend/_310p/model_runner_310p.py @@ -0,0 +1,186 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from __future__ import annotations + +from typing import Any + +import torch +import torch_npu +from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig +from vllm.v1.worker.utils import bind_kv_cache + +from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ +from vllm_ascend.worker.model_runner_v1 import NPUModelRunner + + +class NPUModelRunner310(NPUModelRunner): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._acl_format = ACL_FORMAT_FRACTAL_NZ + + def initialize_kv_cache_tensors( + self, + kv_cache_config: KVCacheConfig, + ) -> dict[str, Any]: + """ + Initialize KV cache tensors for 310P. + + 1) allocate buffers + 2) reshape / transform to the final layout + 3) optional cross-layer sharing + 4) bind buffers to the static forward context + """ + # 310P limitation: KV transfer is not supported. + if self.vllm_config.kv_transfer_config is not None: + raise ValueError("KV cache transfer is not supported for 310P.") + + kv_cache_raw_tensors = self._allocate_kv_cache_tensors_310p(kv_cache_config) + kv_caches = self._reshape_kv_cache_tensors_310p(kv_cache_config, kv_cache_raw_tensors) + + # Keep the same cross-layer KV cache sharing logic as the main branch. + # For 310P, this is expected to be empty in most cases, but keeping it + # makes the code path consistent and easier to reason about. + for layer_name, target_layer_name in self.shared_kv_cache_layers.items(): + kv_caches[layer_name] = kv_caches[target_layer_name] + + # 310P devices do not support the "longcat_flash" special case here, so always be "1". + bind_kv_cache( + kv_caches, + self.compilation_config.static_forward_context, + self.kv_caches, + 1, + ) + return kv_caches + + def _allocate_kv_cache_tensors_310p( + self, + kv_cache_config: KVCacheConfig, + ) -> dict[str, tuple[torch.Tensor, torch.Tensor]]: + """ + Allocate KV cache buffers for each attention layer. + + Unlike the non-310p path, 310P uses torch.zeros directly with the final dtype, + and defers layout casting (ACL format) to the reshape step. + """ + # Build a mapping: layer_name -> tensor_size(bytes). + kv_cache_sizes: dict[str, int] = {} + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + # 310P limitation: a KV cache tensor must not be shared by multiple layers. + assert len(kv_cache_tensor.shared_by) == 1, ( + "KV cache tensor shared by multiple layers is not supported in 310P." + ) + kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size + + kv_cache_raw_tensors: dict[str, tuple[torch.Tensor, torch.Tensor]] = {} + + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec + attn_backend = group.backend + + if not isinstance(kv_cache_spec, FullAttentionSpec): + raise ValueError("Unknown KV cache spec type.") + + for layer_name in group.layer_names: + if layer_name in self.runner_only_attn_layers: + continue + + if "attn" not in layer_name: + continue + + # Compute how many blocks this layer can hold. 
+ tensor_size = kv_cache_sizes[layer_name] + assert tensor_size % kv_cache_spec.page_size_bytes == 0 + num_blocks = tensor_size // kv_cache_spec.page_size_bytes + + # `num_blocks` must be >= the number KVCacheManager may allocate. + assert num_blocks >= kv_cache_config.num_blocks + + # Determine the KV cache shape from backend. + kv_cache_shape = self._get_kv_cache_shape_310p( + attn_backend=attn_backend, + kv_cache_spec=kv_cache_spec, + num_blocks=num_blocks, + ) + + shape = kv_cache_shape[1:] + dtype = kv_cache_spec.dtype + + k_tensor = torch.zeros(shape, dtype=dtype, device=self.device) + v_tensor = torch.zeros(shape, dtype=dtype, device=self.device) + kv_cache_raw_tensors[layer_name] = (k_tensor, v_tensor) + + return kv_cache_raw_tensors + + def _reshape_kv_cache_tensors_310p( + self, + kv_cache_config: KVCacheConfig, + kv_cache_raw_tensors: dict[str, tuple[torch.Tensor, torch.Tensor]], + ) -> dict[str, Any]: + """ + Transform allocated KV cache buffers into the final layout required by 310P. + + For 310P, this mainly means casting tensors into the expected ACL format. + """ + kv_caches: dict[str, Any] = {} + + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec + if not isinstance(kv_cache_spec, FullAttentionSpec): + raise ValueError("Unknown KV cache spec type.") + + for layer_name in group.layer_names: + if layer_name in self.runner_only_attn_layers: + continue + if "attn" not in layer_name: + continue + + k_tensor, v_tensor = kv_cache_raw_tensors[layer_name] + + # In-place ACL layout cast to avoid the extra allocation of npu_format_cast, + # which can spike peak memory (~2x KV cache) during initialization and trigger OOM. + torch_npu.npu_format_cast_(k_tensor, self._acl_format) + torch_npu.npu_format_cast_(v_tensor, self._acl_format) + kv_caches[layer_name] = (k_tensor, v_tensor) + + return kv_caches + + def _get_kv_cache_shape_310p( + self, + attn_backend: Any, + kv_cache_spec: FullAttentionSpec, + num_blocks: int, + ) -> tuple[int, ...]: + """ + Compute KV cache shape with (optional) hybrid block support. + """ + if hasattr(attn_backend, "get_supported_block_size") and self.use_hybrid_blocks: + block_size = attn_backend.get_supported_block_size()[0] + block_size_chunk = kv_cache_spec.block_size // block_size + return attn_backend.get_kv_cache_shape( + num_blocks * block_size_chunk, + block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + ) + + return attn_backend.get_kv_cache_shape( + num_blocks, + kv_cache_spec.block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + ) diff --git a/vllm_ascend/_310p/modelrunner_310p.py b/vllm_ascend/_310p/modelrunner_310p.py deleted file mode 100644 index e83ac39c..00000000 --- a/vllm_ascend/_310p/modelrunner_310p.py +++ /dev/null @@ -1,100 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# - -from __future__ import annotations - -from typing import Any - -import torch -import torch_npu -from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig -from vllm.v1.worker.utils import bind_kv_cache - -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ -from vllm_ascend.worker.model_runner_v1 import NPUModelRunner - - -class NPUModelRunner310(NPUModelRunner): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._acl_format = ACL_FORMAT_FRACTAL_NZ - - def _initialize_kv_cache_tensors_310p(self, kv_cache_config: KVCacheConfig) -> dict[str, Any]: - if self.vllm_config.kv_transfer_config is not None: - raise ValueError("KV cache transfer is not supported for 310P.") - - kv_cache_sizes: dict[str, int] = {} - for kv_cache_tensor in kv_cache_config.kv_cache_tensors: - assert len(kv_cache_tensor.shared_by) == 1, ( - "KV cache tensor shared by multiple layers is not supported in 310P." - ) - kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size - - kv_caches: dict[str, Any] = {} - - for group in self._kv_cache_spec_attn_group_iterator(): - kv_cache_spec = group.kv_cache_spec - attn_backend = group.backend - - if not isinstance(kv_cache_spec, FullAttentionSpec): - raise ValueError("Unknown KV cache spec type.") - - for layer_name in group.layer_names: - if layer_name in self.runner_only_attn_layers: - continue - - tensor_size = kv_cache_sizes[layer_name] - assert tensor_size % kv_cache_spec.page_size_bytes == 0 - num_blocks = tensor_size // kv_cache_spec.page_size_bytes - assert num_blocks >= kv_cache_config.num_blocks - - if hasattr(attn_backend, "get_supported_block_size") and self.use_hybrid_blocks: - block_size = attn_backend.get_supported_block_size()[0] - block_size_chunk = kv_cache_spec.block_size // block_size - kv_cache_shape = attn_backend.get_kv_cache_shape( - num_blocks * block_size_chunk, - block_size, - kv_cache_spec.num_kv_heads, - kv_cache_spec.head_size, - ) - else: - kv_cache_shape = attn_backend.get_kv_cache_shape( - num_blocks, - kv_cache_spec.block_size, - kv_cache_spec.num_kv_heads, - kv_cache_spec.head_size, - ) - - dtype = kv_cache_spec.dtype - - if "attn" in layer_name: - k_tensor = torch.zeros(kv_cache_shape[1:], dtype=dtype, device=self.device) - v_tensor = torch.zeros(kv_cache_shape[1:], dtype=dtype, device=self.device) - k_cache = torch_npu.npu_format_cast(k_tensor, self._acl_format) - v_cache = torch_npu.npu_format_cast(v_tensor, self._acl_format) - kv_caches[layer_name] = (k_cache, v_cache) - - bind_kv_cache( - kv_caches, - self.compilation_config.static_forward_context, - self.kv_caches, - 1, # 310p devices donnot support: hf_config.model_type == "longcat_flash" - ) - return kv_caches - - def initialize_kv_cache_tensors(self, kv_cache_config: KVCacheConfig) -> dict[str, Any]: - return self._initialize_kv_cache_tensors_310p(kv_cache_config) diff --git a/vllm_ascend/_310p/ops/layernorm.py b/vllm_ascend/_310p/ops/layernorm.py new file mode 100644 index 00000000..d1b4978c --- /dev/null +++ b/vllm_ascend/_310p/ops/layernorm.py @@ -0,0 +1,44 @@ +import torch +import torch_npu + +from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm + + +class AscendRMSNorm310(AscendRMSNorm): + def forward_oot( + self, + x: torch.Tensor, + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if residual is not None: + orig_dtype = residual.dtype + if x is None or x.numel() == 0 or x.shape[-1] == 0: + x = residual.to(dtype=residual.dtype) + else: + x = x + 
residual.to(x.dtype) + + residual = x.to(orig_dtype) + x, _ = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon) + return x, residual + + x, residual = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon) + if self.bias is not None: + x.add_(self.bias) + return x + + +class AscendGemmaRMSNorm310(AscendGemmaRMSNorm): + def forward_oot( + self, + x: torch.Tensor, + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if residual is not None: + orig_dtype = residual.dtype + x = x + residual.to(x.dtype) + residual = x.to(orig_dtype) + x, _ = torch_npu.npu_rms_norm(x, 1.0 + self.weight, self.variance_epsilon) + return x, residual + + x, _ = torch_npu.npu_rms_norm(x, 1.0 + self.weight, self.variance_epsilon) + return x diff --git a/vllm_ascend/_310p/ops/mm_encoder_attention.py b/vllm_ascend/_310p/ops/mm_encoder_attention.py index ebe33558..97481879 100644 --- a/vllm_ascend/_310p/ops/mm_encoder_attention.py +++ b/vllm_ascend/_310p/ops/mm_encoder_attention.py @@ -17,11 +17,8 @@ import einops import torch -import torch.nn.functional as F import torch_npu -import vllm_ascend.envs as envs_ascend -from vllm_ascend.ops.mm_encoder_attention import MAX_PAD_SIZE, MIN_PAD_SIZE from vllm_ascend.ops.mm_encoder_attention import AscendMMEncoderAttention as _Base @@ -43,23 +40,6 @@ class AscendMMEncoderAttention310(_Base): q, k, v = self.reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len) - enable_pad = envs_ascend.USE_OPTIMIZED_MODEL and self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE - - origin_shape = q.shape[-1] - if enable_pad: - pad_len = MAX_PAD_SIZE - origin_shape - q = F.pad(q, (0, pad_len), mode="constant", value=0) - k = F.pad(k, (0, pad_len), mode="constant", value=0) - v = F.pad(v, (0, pad_len), mode="constant", value=0) - - origin_dim = origin_shape - cur_dim = q.shape[-1] - pad16 = (16 - cur_dim % 16) % 16 - if pad16: - q = F.pad(q, (0, pad16), mode="constant", value=0) - k = F.pad(k, (0, pad16), mode="constant", value=0) - v = F.pad(v, (0, pad16), mode="constant", value=0) - if cu_seqlens is None: cu_seqlens = torch.arange( 0, @@ -69,36 +49,19 @@ class AscendMMEncoderAttention310(_Base): device=query.device, ) - total_q_tokens = bsz * q_len - context_flat = q.new_empty((total_q_tokens, self.num_heads, q.shape[-1])) + seq_len = torch.diff(cu_seqlens).to("cpu", dtype=torch.int32) - st = 0 - seg_lens = torch.diff(cu_seqlens).to("cpu", dtype=torch.int64).tolist() - for seg_len in seg_lens: - seg_len = int(seg_len) - ed = st + seg_len + context_layer = torch.empty_like(q) + torch_npu._npu_flash_attention_unpad( + query=q, + key=k, + value=v, + seq_len=seq_len, + scale_value=self.head_size**-0.5, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + out=context_layer, + ) - q_i = q[st:ed].unsqueeze(0) # [1, S, H, D] - k_i = k[st:ed].unsqueeze(0) - v_i = v[st:ed].unsqueeze(0) - - qs = int(q_i.shape[1]) - kvs = int(k_i.shape[1]) - - out_i = torch_npu.npu_prompt_flash_attention( - q_i, - k_i, - v_i, - input_layout="BSND", - num_heads=self.num_heads, - num_key_value_heads=self.num_kv_heads, - scale_value=self.head_size**-0.5, - pre_tokens=qs, - next_tokens=kvs, - ) - context_flat[st:ed] = out_i[0] - st = ed - - context_flat = context_flat[..., :origin_dim] - context_layer = einops.rearrange(context_flat, "(b s) h d -> b s h d", b=bsz).contiguous() + context_layer = einops.rearrange(context_layer, "(b s) h d -> b s h d", b=bsz).contiguous() return context_layer diff --git a/vllm_ascend/_310p/worker_310p.py 
b/vllm_ascend/_310p/worker_310p.py index adfb00fc..9565a5ec 100644 --- a/vllm_ascend/_310p/worker_310p.py +++ b/vllm_ascend/_310p/worker_310p.py @@ -18,7 +18,7 @@ import torch_npu from vllm.logger import logger -from vllm_ascend._310p.modelrunner_310p import NPUModelRunner310 +from vllm_ascend._310p.model_runner_310p import NPUModelRunner310 from vllm_ascend.worker.worker import NPUWorker, init_workspace_manager diff --git a/vllm_ascend/ops/activation.py b/vllm_ascend/ops/activation.py index 4889d232..a605b87c 100644 --- a/vllm_ascend/ops/activation.py +++ b/vllm_ascend/ops/activation.py @@ -33,12 +33,7 @@ class AscendSiluAndMul(SiluAndMul): def forward_oot(self, x: torch.Tensor) -> torch.Tensor: import torch_npu - from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type - torch.ops.vllm.maybe_prefetch_mlp_down_proj(x) - if get_ascend_device_type() == AscendDeviceType._310P: - out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16) - else: - out = torch_npu.npu_swiglu(x) + out = torch_npu.npu_swiglu(x) torch.ops.vllm.maybe_wait_prefetch_done(out) return out diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py index 88a005cc..dd676e8f 100644 --- a/vllm_ascend/ops/layernorm.py +++ b/vllm_ascend/ops/layernorm.py @@ -52,15 +52,8 @@ class AscendRMSNorm(RMSNorm): ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: import torch_npu - from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type if residual is not None: - if get_ascend_device_type() == AscendDeviceType._310P: - orig_dtype = residual.dtype - x = x + residual.to(x.dtype) - residual = x.to(orig_dtype) - x, _ = torch_npu.npu_rms_norm(x, self.weight, - self.variance_epsilon) - elif enable_custom_op(): + if enable_custom_op(): x, _, residual = torch.ops._C_ascend.npu_add_rms_norm_bias( x, residual, self.weight, self.bias, self.variance_epsilon) else: @@ -88,13 +81,7 @@ class AscendGemmaRMSNorm(GemmaRMSNorm): from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type if residual is not None: - if get_ascend_device_type() == AscendDeviceType._310P: - orig_dtype = residual.dtype - x = x + residual.to(x.dtype) - residual = x.to(orig_dtype) - x, _ = torch_npu.npu_rms_norm(x, 1.0 + self.weight, - self.variance_epsilon) - elif enable_custom_op(): + if enable_custom_op(): x, _, residual = torch.ops._C_ascend.npu_add_rms_norm_bias( x, residual, 1.0 + self.weight, None, self.variance_epsilon) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 238c5c0f..eb88afdd 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -721,16 +721,17 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None): # 310P: override selected ops with 310P implementations (keep minimal changes outside _310p) if is_310p(): from vllm_ascend._310p.ops.activation import AscendSiluAndMul310 + from vllm_ascend._310p.ops.layernorm import AscendGemmaRMSNorm310, AscendRMSNorm310 from vllm_ascend._310p.ops.mm_encoder_attention import AscendMMEncoderAttention310 - from vllm_ascend._310p.ops.rotary_embedding import ( - AscendMRotaryEmbedding310, - ) + from vllm_ascend._310p.ops.rotary_embedding import AscendMRotaryEmbedding310 REGISTERED_ASCEND_OPS.update( { "SiluAndMul": AscendSiluAndMul310, "MMEncoderAttention": AscendMMEncoderAttention310, "MRotaryEmbedding": AscendMRotaryEmbedding310, + "RMSNorm": AscendRMSNorm310, + "GemmaRMSNorm": AscendGemmaRMSNorm310, } )
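
Note on the final utils.py hunk: the 310P-specific op classes are swapped in at registration time instead of being branched on inside the shared ops (the removed `get_ascend_device_type() == AscendDeviceType._310P` checks in activation.py and layernorm.py). The standalone sketch below, which is not part of the patch, illustrates that dispatch pattern; the stub classes and the simplified is_310p() probe are placeholders for the real symbols in vllm_ascend.utils, vllm_ascend.ops, and vllm_ascend._310p.ops.

# Standalone sketch of the override pattern used by register_ascend_customop():
# default Ascend op classes are collected first, then selectively replaced with
# 310P subclasses when running on 310P hardware. All classes below are stubs.

class AscendRMSNorm: ...                          # stand-in for vllm_ascend.ops.layernorm.AscendRMSNorm
class AscendRMSNorm310(AscendRMSNorm): ...        # stand-in for vllm_ascend._310p.ops.layernorm.AscendRMSNorm310
class AscendSiluAndMul: ...                       # stand-in for vllm_ascend.ops.activation.AscendSiluAndMul
class AscendSiluAndMul310(AscendSiluAndMul): ...  # stand-in for the 310P override

def is_310p() -> bool:
    return False  # stand-in for the real hardware probe in vllm_ascend.utils

REGISTERED_ASCEND_OPS: dict[str, type] = {
    "RMSNorm": AscendRMSNorm,
    "SiluAndMul": AscendSiluAndMul,
}

if is_310p():
    # On 310P hosts the same op names resolve to the 310P subclasses,
    # keeping device-specific code out of the shared implementations.
    REGISTERED_ASCEND_OPS.update(
        {
            "RMSNorm": AscendRMSNorm310,
            "SiluAndMul": AscendSiluAndMul310,
        }
    )

print(REGISTERED_ASCEND_OPS["RMSNorm"].__name__)  # AscendRMSNorm on non-310P hosts

The unit-test skips added in tests/ut rely on the same is_310p() probe, so the pseudo-310P parametrizations only exercise the branch where the 310P classes are actually registered.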