[Misc] Move lora patch file into lora module (#2797)

Clean up the now-useless files in the patch module. The LoRA support list can be updated inside vLLM Ascend directly, so there is no need to patch vLLM for this.
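
A minimal sketch of the new flow, using only names that appear in this change and assuming vLLM plus this vLLM Ascend branch are installed:

import vllm.lora.utils

from vllm_ascend.lora.utils import refresh_all_lora_classes

# PunicaWrapperNPU.__init__ calls this, so constructing the NPU punica
# wrapper is enough to register the Ascend LoRA layer classes with vLLM;
# no patch module is imported for LoRA support anymore.
refresh_all_lora_classes()

names = {cls.__name__ for cls in vllm.lora.utils._all_lora_classes}
assert "AscendRowParallelLinearWithLoRA" in names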


- vLLM version: v0.10.1.1
- vLLM main: f4962a6d55

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan
2025-09-08 21:42:12 +08:00
committed by GitHub
parent 85d989a3b9
commit 7d6d9449a8
10 changed files with 64 additions and 72 deletions

View File

@@ -559,9 +559,8 @@ class TestNPUPlatform(TestBase):
     def test_get_punica_wrapper(self):
         result = self.platform.get_punica_wrapper()
-        self.assertEqual(
-            result,
-            "vllm_ascend.lora.punica_wrapper.punica_npu.PunicaWrapperNPU")
+        self.assertEqual(result,
+                         "vllm_ascend.lora.punica_npu.PunicaWrapperNPU")
 
     @patch("torch.npu.reset_peak_memory_stats")
     @patch("torch.npu.max_memory_allocated")

View File

@@ -11,12 +11,14 @@ if is_310p():
         bgmv_shrink, sgmv_expand,
         sgmv_expand_slice, sgmv_shrink)
 else:
-    from vllm_ascend.lora.punica_wrapper.lora_ops import (
-        bgmv_expand, bgmv_expand_slice,
-        bgmv_shrink, sgmv_expand,
-        sgmv_expand_slice, sgmv_shrink)
+    from vllm_ascend.lora.lora_ops import (bgmv_expand, bgmv_expand_slice,
+                                           bgmv_shrink, sgmv_expand,
+                                           sgmv_expand_slice, sgmv_shrink)
 
 from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
+from vllm_ascend.lora.utils import refresh_all_lora_classes
 
 # The platforms that are compatible with the PyTorch-native implementation can
 # inherit this class
@@ -31,6 +33,7 @@ class PunicaWrapperNPU(PunicaWrapperBase):
                  device: Union[torch.device, str], **kwargs):
         PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
                                    device)
+        refresh_all_lora_classes()
 
     def _shrink_prefill(
         self,

View File

@@ -1,28 +1,19 @@
 from typing import Optional
 
+import vllm
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.config import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
-                              RowParallelLinearWithLoRA)
+                              RowParallelLinearWithLoRA,
+                              VocabParallelEmbeddingWithLoRA)
 
 from vllm_ascend.ops.linear import (AscendColumnParallelLinear,
                                     AscendMergedColumnParallelLinear,
                                     AscendRowParallelLinear)
-
-
-class AscendRowParallelLinearWithLoRA(RowParallelLinearWithLoRA):
-
-    @classmethod
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        return type(source_layer) is AscendRowParallelLinear
+from vllm_ascend.ops.vocab_parallel_embedding import \
+    AscendVocabParallelEmbedding
 
 
 class AscendColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
@@ -50,3 +41,37 @@ class AscendMergedColumnParallelLinearWithLoRA(
         model_config: Optional[PretrainedConfig],
     ) -> bool:
         return type(source_layer) is AscendMergedColumnParallelLinear
+
+
+class AscendRowParallelLinearWithLoRA(RowParallelLinearWithLoRA):
+
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return type(source_layer) is AscendRowParallelLinear
+
+
+class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA):
+
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return type(source_layer) is AscendVocabParallelEmbedding
+
+
+def refresh_all_lora_classes():
+    vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithLoRA)
+    vllm.lora.utils._all_lora_classes.add(
+        AscendMergedColumnParallelLinearWithLoRA)
+    vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithLoRA)
+    vllm.lora.utils._all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA)
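
For context, this is roughly how a registry like `_all_lora_classes` is consumed; the real lookup lives in vLLM's LoRA utilities, so treat this loop as an illustrative sketch rather than vLLM's actual code:

import vllm.lora.utils


def pick_lora_class(source_layer, lora_config, packed_modules_list,
                    model_config):
    # Return the first registered LoRA wrapper whose can_replace_layer()
    # accepts this source layer, e.g. AscendRowParallelLinearWithLoRA for
    # an AscendRowParallelLinear module.
    for lora_cls in vllm.lora.utils._all_lora_classes:
        if lora_cls.can_replace_layer(source_layer, lora_config,
                                      packed_modules_list, model_config):
            return lora_cls
    return None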

View File

@@ -46,6 +46,15 @@
 #      Need a PR to vllm to support get port from environment.
 #    Future Plan:
 #      Remove those patch when vllm merged them
+# 2. `torch.distributed.all_reduce`, `torch.distributed.broadcast`
+#    Why:
+#      tensor alignment for 310p
+#    How
+#      rewrite all_reduce and broadcast in torch.distributed
+#    Related PR (if no, explain why):
+#      No, not ready yet.
+#    Future Plan:
+#      Find a better way to support tensor alignment for 310p without this patch.
 #
 # * Worker Patch:
 # ===============
@@ -86,19 +95,15 @@
 #      - https://github.com/vllm-project/vllm/pull/21591
 #    Future Plan:
 #      Revert it when vLLM merge #21591 and release new version
-# ** File: worker/patch_common/patch_linear.py **
+# ** File: worker/patch_common/patch_logits.py **
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#    1. `vllm.model_executor.layers.linear.RowParallelLinear`
+#    1. `vllm._custom_ops.apply_repetition_penalties`
 #       Why:
-#       We need to fuse matmul and allreuce in `RowParallelLinear`
-#       to improve performance.
+#       apply_repetition_penalties in vLLM use tensor.is_cuda to check if tensor is on cuda. But the value is always True
+#       on ascend, thus we need to patch apply_repetition_penalties.
 #       How
-#       Create a new class `AscendRowParallelLinear` that inherits from `RowParallelLinear`.
-#       In this class, we override the `forward` method to use
-#       torch_npu.npu_mm_all_reduce_base to replace matmul and allreduce.
+#       Remove the related cuda check in apply_repetition_penalties.
 #       Related PR (if no, explain why):
-#       - https://github.com/vllm-project/vllm-ascend/pull/1926
+#       - this is a bug by Ascend only. It can' be fixed in vLLM.
 #       Future Plan:
-#       Validate more models in all kinds of scenario,
-#       if performance is always improved, we can enable this patch by default and remove the env
-#       variable `VLLM_ASCEND_ENABLE_FUSE_MATMUL_ALLREDUCE` in the future.
+#       Fix this bug in torch-npu, bump torch-npu version and remove this patch.
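
The worker patches documented above all follow the same rebinding pattern: import the target module and replace the attribute at import time. A heavily simplified, hypothetical illustration for the `torch.distributed.all_reduce` case from the docstring (the real patch also performs the 310p tensor alignment, which is omitted here):

import torch.distributed as dist

_original_all_reduce = dist.all_reduce


def _all_reduce_with_alignment(tensor, *args, **kwargs):
    # A real implementation would pad/align `tensor` for 310p before
    # delegating to the saved original; this only shows the shape of the
    # monkey patch.
    return _original_all_reduce(tensor, *args, **kwargs)


dist.all_reduce = _all_reduce_with_alignment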

View File

@@ -17,7 +17,4 @@
 import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_logits  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_lora  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_lora_embedding  # noqa
-import vllm_ascend.patch.worker.patch_common.patch_lora_linear  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa

View File

@@ -1,15 +0,0 @@
-import vllm
-from vllm.lora.utils import _all_lora_classes
-
-from vllm_ascend.patch.worker.patch_common.patch_lora_embedding import \
-    AscendVocabParallelEmbeddingWithLoRA
-from vllm_ascend.patch.worker.patch_common.patch_lora_linear import (
-    AscendColumnParallelLinearWithLoRA,
-    AscendMergedColumnParallelLinearWithLoRA, AscendRowParallelLinearWithLoRA)
-
-_all_lora_classes.add(AscendRowParallelLinearWithLoRA)
-_all_lora_classes.add(AscendColumnParallelLinearWithLoRA)
-_all_lora_classes.add(AscendMergedColumnParallelLinearWithLoRA)
-_all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA)
-vllm.lora.utils._all_lora_classes = _all_lora_classes

View File

@@ -1,22 +0,0 @@
-from typing import Optional
-
-from torch import nn
-from transformers import PretrainedConfig
-from vllm.config import LoRAConfig
-from vllm.lora.layers import VocabParallelEmbeddingWithLoRA
-
-from vllm_ascend.ops.vocab_parallel_embedding import \
-    AscendVocabParallelEmbedding
-
-
-class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA):
-
-    @classmethod
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        return type(source_layer) is AscendVocabParallelEmbedding

View File

@@ -270,7 +270,7 @@ class NPUPlatform(Platform):
     @classmethod
     def get_punica_wrapper(cls) -> str:
-        return "vllm_ascend.lora.punica_wrapper.punica_npu.PunicaWrapperNPU"
+        return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU"
 
     @classmethod
     def get_current_memory_usage(cls,
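
`get_punica_wrapper` returns a dotted path rather than a class; vLLM resolves that string when it builds the punica wrapper. An importlib-based sketch of such a resolution, for illustration only (it assumes vllm_ascend is importable):

import importlib


def resolve_qualname(qualname: str):
    # Split "package.module.ClassName" into module path and attribute name,
    # import the module, and return the attribute.
    module_name, _, attr = qualname.rpartition(".")
    return getattr(importlib.import_module(module_name), attr)


punica_cls = resolve_qualname("vllm_ascend.lora.punica_npu.PunicaWrapperNPU")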