[misc] Remove useless patch_logits (#4252)
Torch-npu 2.7.1 has fixed the device check bug. This patch can be
removed now.
- vLLM main:
2918c1b49c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -104,29 +104,7 @@
|
|||||||
# Future Plan:
|
# Future Plan:
|
||||||
# Remove this patch once vLLM has merged them.
|
# Remove this patch once vLLM has merged them.
|
||||||
#
|
#
|
||||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
# ** File: worker/patch_roberta.py **
|
||||||
# 1. `vllm.v1.sample.sampler.Sampler.gather_logprobs`
|
|
||||||
# Why:
|
|
||||||
# We need to patch gather_logprobs to make sure it calls batched_count_greater_than
|
|
||||||
# with backend=current_platform.simple_compile_backend
|
|
||||||
# How:
|
|
||||||
# Patch gather_logprobs to call the new batched_count_greater_than
|
|
||||||
# Related PR (if no, explain why):
|
|
||||||
# - https://github.com/vllm-project/vllm/pull/21591
|
|
||||||
# Future Plan:
|
|
||||||
# Revert it when vLLM merges #21591 and releases a new version
|
|
||||||
# ** File: worker/patch_logits.py **
|
|
||||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
# 1. `vllm._custom_ops.apply_repetition_penalties`
|
|
||||||
# Why:
|
|
||||||
# apply_repetition_penalties in vLLM uses tensor.is_cuda to check if the tensor is on cuda. But the value is always True
|
|
||||||
# on ascend, thus we need to patch apply_repetition_penalties.
|
|
||||||
# How:
|
|
||||||
# Remove the related cuda check in apply_repetition_penalties.
|
|
||||||
# Related PR (if no, explain why):
|
|
||||||
# - This is an Ascend-only bug. It can't be fixed in vLLM.
|
|
||||||
# Future Plan:
|
|
||||||
# Fix this bug in torch-npu, bump torch-npu version and remove this patch.
|
|
||||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
# 1. `vllm.model_executor.models.roberta.RobertaEmbedding.forward`
|
# 1. `vllm.model_executor.models.roberta.RobertaEmbedding.forward`
|
||||||
# Why:
|
# Why:
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ if HAS_TRITON:
|
|||||||
# isort: off
|
# isort: off
|
||||||
import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
||||||
import vllm_ascend.patch.worker.patch_distributed # noqa
|
import vllm_ascend.patch.worker.patch_distributed # noqa
|
||||||
import vllm_ascend.patch.worker.patch_logits # noqa
|
|
||||||
import vllm_ascend.patch.worker.patch_roberta # noqa
|
import vllm_ascend.patch.worker.patch_roberta # noqa
|
||||||
import vllm_ascend.patch.worker.patch_weight_loader # noqa
|
import vllm_ascend.patch.worker.patch_weight_loader # noqa
|
||||||
import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
|
import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
|
||||||
|
|||||||
@@ -1,26 +0,0 @@
|
|||||||
import torch
|
|
||||||
import vllm
|
|
||||||
from vllm._custom_ops import apply_repetition_penalties_torch
|
|
||||||
|
|
||||||
|
|
||||||
def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
|
|
||||||
output_mask: torch.Tensor,
|
|
||||||
repetition_penalties: torch.Tensor) -> None:
|
|
||||||
"""Apply repetition penalties to logits in-place.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
logits: The logits tensor of shape [num_seqs, vocab_size].
|
|
||||||
prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
|
|
||||||
output_mask: A boolean tensor indicating which tokens appear in the output.
|
|
||||||
repetition_penalties: The repetition penalties of shape (num_seqs, ).
|
|
||||||
"""
|
|
||||||
apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
|
|
||||||
repetition_penalties)
|
|
||||||
|
|
||||||
|
|
||||||
# NPU device type tensors have attributes is_cuda=True and is_npu=True, according to its implementation in
|
|
||||||
# https://github.com/Ascend/pytorch/blob/863b9071cbdf47023c12c246e3efa9c6e2285fc6/torch_npu/npu/_stream_check.py#L74
|
|
||||||
# This causes vLLM's apply_repetition_penalties function to take the "if logits.is_cuda" branch and
|
|
||||||
# call the custom op implemented in CUDA, which is not compatible with NPU.
|
|
||||||
# Reference: https://github.com/vllm-project/vllm/blob/f66673a39d9f364194c249f28098cad8a5584ccb/vllm/_custom_ops.py#L314
|
|
||||||
vllm._custom_ops.apply_repetition_penalties = apply_repetition_penalties
|
|
||||||
Reference in New Issue
Block a user