diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 2170a9a..96873ef 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -559,9 +559,8 @@ class TestNPUPlatform(TestBase): def test_get_punica_wrapper(self): result = self.platform.get_punica_wrapper() - self.assertEqual( - result, - "vllm_ascend.lora.punica_wrapper.punica_npu.PunicaWrapperNPU") + self.assertEqual(result, + "vllm_ascend.lora.punica_npu.PunicaWrapperNPU") @patch("torch.npu.reset_peak_memory_stats") @patch("torch.npu.max_memory_allocated") diff --git a/vllm_ascend/lora/punica_wrapper/lora_ops.py b/vllm_ascend/lora/lora_ops.py similarity index 100% rename from vllm_ascend/lora/punica_wrapper/lora_ops.py rename to vllm_ascend/lora/lora_ops.py diff --git a/vllm_ascend/lora/punica_wrapper/punica_npu.py b/vllm_ascend/lora/punica_npu.py similarity index 97% rename from vllm_ascend/lora/punica_wrapper/punica_npu.py rename to vllm_ascend/lora/punica_npu.py index a85c837..b86ee33 100644 --- a/vllm_ascend/lora/punica_wrapper/punica_npu.py +++ b/vllm_ascend/lora/punica_npu.py @@ -11,12 +11,14 @@ if is_310p(): bgmv_shrink, sgmv_expand, sgmv_expand_slice, sgmv_shrink) else: - from vllm_ascend.lora.punica_wrapper.lora_ops import ( - bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink) + from vllm_ascend.lora.lora_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase +from vllm_ascend.lora.utils import refresh_all_lora_classes + # The platforms that are compatible with the PyTorch-native implementation can # inherit this class @@ -31,6 +33,7 @@ class PunicaWrapperNPU(PunicaWrapperBase): device: Union[torch.device, str], **kwargs): PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device) + refresh_all_lora_classes() def _shrink_prefill( self, diff --git a/vllm_ascend/lora/punica_wrapper/__init__.py 
b/vllm_ascend/lora/punica_wrapper/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/vllm_ascend/patch/worker/patch_common/patch_lora_linear.py b/vllm_ascend/lora/utils.py similarity index 62% rename from vllm_ascend/patch/worker/patch_common/patch_lora_linear.py rename to vllm_ascend/lora/utils.py index fdfe51d..47e95cd 100644 --- a/vllm_ascend/patch/worker/patch_common/patch_lora_linear.py +++ b/vllm_ascend/lora/utils.py @@ -1,28 +1,19 @@ from typing import Optional +import vllm from torch import nn from transformers import PretrainedConfig from vllm.config import LoRAConfig from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, - RowParallelLinearWithLoRA) + RowParallelLinearWithLoRA, + VocabParallelEmbeddingWithLoRA) from vllm_ascend.ops.linear import (AscendColumnParallelLinear, AscendMergedColumnParallelLinear, AscendRowParallelLinear) - - -class AscendRowParallelLinearWithLoRA(RowParallelLinearWithLoRA): - - @classmethod - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - return type(source_layer) is AscendRowParallelLinear +from vllm_ascend.ops.vocab_parallel_embedding import \ + AscendVocabParallelEmbedding class AscendColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): @@ -50,3 +41,37 @@ class AscendMergedColumnParallelLinearWithLoRA( model_config: Optional[PretrainedConfig], ) -> bool: return type(source_layer) is AscendMergedColumnParallelLinear + + +class AscendRowParallelLinearWithLoRA(RowParallelLinearWithLoRA): + + @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + return type(source_layer) is AscendRowParallelLinear + + +class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA): + + @classmethod + def 
can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + return type(source_layer) is AscendVocabParallelEmbedding + + +def refresh_all_lora_classes(): + vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithLoRA) + vllm.lora.utils._all_lora_classes.add( + AscendMergedColumnParallelLinearWithLoRA) + vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithLoRA) + vllm.lora.utils._all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA) diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 754a344..fd39f27 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -46,6 +46,15 @@ # Need a PR to vllm to support get port from environment. # Future Plan: # Remove those patch when vllm merged them +# 2. `torch.distributed.all_reduce`, `torch.distributed.broadcast` +# Why: +# tensor alignment for 310p +# How: +# rewrite all_reduce and broadcast in torch.distributed +# Related PR (if no, explain why): +# No, not ready yet. +# Future Plan: +# Find a better way to support tensor alignment for 310p without this patch. # # * Worker Patch: # =============== @@ -86,19 +95,15 @@ # - https://github.com/vllm-project/vllm/pull/21591 # Future Plan: # Revert it when vLLM merge #21591 and release new version -# ** File: worker/patch_common/patch_linear.py ** +# ** File: worker/patch_common/patch_logits.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.model_executor.layers.linear.RowParallelLinear` +# 1. `vllm._custom_ops.apply_repetition_penalties` # Why: -# We need to fuse matmul and allreuce in `RowParallelLinear` -# to improve performance. +# apply_repetition_penalties in vLLM uses tensor.is_cuda to check if tensor is on cuda. But the value is always True +# on ascend, thus we need to patch apply_repetition_penalties. 
# How: -# Create a new class `AscendRowParallelLinear` that inherits from `RowParallelLinear`. -# In this class, we override the `forward` method to use -# torch_npu.npu_mm_all_reduce_base to replace matmul and allreduce. +# Remove the related cuda check in apply_repetition_penalties. # Related PR (if no, explain why): -# - https://github.com/vllm-project/vllm-ascend/pull/1926 +# - this is a bug by Ascend only. It can't be fixed in vLLM. # Future Plan: -# Validate more models in all kinds of scenario, -# if performance is always improved, we can enable this patch by default and remove the env -# variable `VLLM_ASCEND_ENABLE_FUSE_MATMUL_ALLREDUCE` in the future. +# Fix this bug in torch-npu, bump torch-npu version and remove this patch. diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py index 56af25a..a723072 100644 --- a/vllm_ascend/patch/worker/patch_common/__init__.py +++ b/vllm_ascend/patch/worker/patch_common/__init__.py @@ -17,7 +17,4 @@ import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import vllm_ascend.patch.worker.patch_common.patch_logits # noqa -import vllm_ascend.patch.worker.patch_common.patch_lora # noqa -import vllm_ascend.patch.worker.patch_common.patch_lora_embedding # noqa -import vllm_ascend.patch.worker.patch_common.patch_lora_linear # noqa import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa diff --git a/vllm_ascend/patch/worker/patch_common/patch_lora.py b/vllm_ascend/patch/worker/patch_common/patch_lora.py deleted file mode 100644 index e96f971..0000000 --- a/vllm_ascend/patch/worker/patch_common/patch_lora.py +++ /dev/null @@ -1,15 +0,0 @@ -import vllm -from vllm.lora.utils import _all_lora_classes - -from vllm_ascend.patch.worker.patch_common.patch_lora_embedding import \ -    AscendVocabParallelEmbeddingWithLoRA -from vllm_ascend.patch.worker.patch_common.patch_lora_linear import ( -    AscendColumnParallelLinearWithLoRA, - 
AscendMergedColumnParallelLinearWithLoRA, AscendRowParallelLinearWithLoRA) - -_all_lora_classes.add(AscendRowParallelLinearWithLoRA) -_all_lora_classes.add(AscendColumnParallelLinearWithLoRA) -_all_lora_classes.add(AscendMergedColumnParallelLinearWithLoRA) -_all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA) - -vllm.lora.utils._all_lora_classes = _all_lora_classes diff --git a/vllm_ascend/patch/worker/patch_common/patch_lora_embedding.py b/vllm_ascend/patch/worker/patch_common/patch_lora_embedding.py deleted file mode 100644 index eab545b..0000000 --- a/vllm_ascend/patch/worker/patch_common/patch_lora_embedding.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional - -from torch import nn -from transformers import PretrainedConfig -from vllm.config import LoRAConfig -from vllm.lora.layers import VocabParallelEmbeddingWithLoRA - -from vllm_ascend.ops.vocab_parallel_embedding import \ - AscendVocabParallelEmbedding - - -class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA): - - @classmethod - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - return type(source_layer) is AscendVocabParallelEmbedding diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index d4b2c4e..ee80c7d 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -270,7 +270,7 @@ class NPUPlatform(Platform): @classmethod def get_punica_wrapper(cls) -> str: - return "vllm_ascend.lora.punica_wrapper.punica_npu.PunicaWrapperNPU" + return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU" @classmethod def get_current_memory_usage(cls,