From cf96366a396b7d70b390cb244b53c58e9666c4d5 Mon Sep 17 00:00:00 2001 From: yupeng <507435917@qq.com> Date: Thu, 28 Aug 2025 10:40:51 +0800 Subject: [PATCH] [Bugfix][LoRA][Patch] Fix the LoRA inference bug after upstream vLLM codebase changed (#2560) ### What this PR does / why we need it? The merge of the upstream https://github.com/vllm-project/vllm/pull/22592 caused a vllm-ascend LoRA inference bug. The details are as follows: According to [torch_npu/npu/_stream_check.py](https://github.com/Ascend/pytorch/blob/863b9071cbdf47023c12c246e3efa9c6e2285fc6/torch_npu/npu/_stream_check.py#L74), NPU device type tensors have attributes is_cuda=True and is_npu=True. This causes vLLM's apply_repetition_penalties function to run into the branch of "if logits.is_cuda and logits.is_contiguous()" and call the custom op implemented in CUDA, which is not compatible with NPU. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? pytest -sv tests/e2e/singlecard/test_ilama_lora.py pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/fe8d7b6f03e7d8a36ffb6931397fc81ee594dd64 --------- Signed-off-by: paulyu12 Signed-off-by: paulyu12 <507435917@qq.com> Co-authored-by: paulyu12 --- .../patch/worker/patch_common/__init__.py | 1 + .../patch/worker/patch_common/patch_logits.py | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 vllm_ascend/patch/worker/patch_common/patch_logits.py diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py index 78b6fcd..deb8fe7 100644 --- a/vllm_ascend/patch/worker/patch_common/__init__.py +++ b/vllm_ascend/patch/worker/patch_common/__init__.py @@ -17,4 +17,5 @@ import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import vllm_ascend.patch.worker.patch_common.patch_linear # noqa +import 
vllm_ascend.patch.worker.patch_common.patch_logits  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa
diff --git a/vllm_ascend/patch/worker/patch_common/patch_logits.py b/vllm_ascend/patch/worker/patch_common/patch_logits.py
new file mode 100644
index 0000000..84a92f9
--- /dev/null
+++ b/vllm_ascend/patch/worker/patch_common/patch_logits.py
@@ -0,0 +1,26 @@
+import torch
+import vllm
+from vllm._custom_ops import apply_repetition_penalties_torch
+
+
+def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
+                               output_mask: torch.Tensor,
+                               repetition_penalties: torch.Tensor) -> None:
+    """Apply repetition penalties to logits in-place.
+
+    Args:
+        logits: The logits tensor of shape [num_seqs, vocab_size].
+        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
+        output_mask: A boolean tensor indicating which tokens appear in the output.
+        repetition_penalties: The repetition penalties of shape (num_seqs, ).
+    """
+    apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
+                                     repetition_penalties)
+
+
+# NPU device type tensors have attributes is_cuda=True and is_npu=True, according to its implementation in
+# https://github.com/Ascend/pytorch/blob/863b9071cbdf47023c12c246e3efa9c6e2285fc6/torch_npu/npu/_stream_check.py#L74
+# This causes that vLLM's apply_repetition_penalties function will run into the branch of "if logits.is_cuda" and
+# call the custom op implemented in CUDA, which is not compatible with NPU.
+# Reference: https://github.com/vllm-project/vllm/blob/f66673a39d9f364194c249f28098cad8a5584ccb/vllm/_custom_ops.py#L314
+vllm._custom_ops.apply_repetition_penalties = apply_repetition_penalties