diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 575d3acf..1b346de6 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -104,29 +104,7 @@
 #   Future Plan:
 #       Remove this patch when vllm merged them.
 #
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#   1. `vllm.v1.sample.sampler.Sampler.gather_logprobs`
-#    Why:
-#       We need to patch gather_logprobs to make sure call batched_count_greater_than
-#       with backend=current_platform.simple_compile_backend
-#    How:
-#       Patch gather_logprobs call new batched_count_greater_than
-#    Related PR (if no, explain why):
-#       - https://github.com/vllm-project/vllm/pull/21591
-#    Future Plan:
-#       Revert it when vLLM merge #21591 and release new version
-# ** File: worker/patch_logits.py **
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#   1. `vllm._custom_ops.apply_repetition_penalties`
-#    Why:
-#       apply_repetition_penalties in vLLM use tensor.is_cuda to check if tensor is on cuda. But the value is always True
-#       on ascend, thus we need to patch apply_repetition_penalties.
-#    How:
-#       Remove the related cuda check in apply_repetition_penalties.
-#    Related PR (if no, explain why):
-#       - this is a bug by Ascend only. It can' be fixed in vLLM.
-#    Future Plan:
-#       Fix this bug in torch-npu, bump torch-npu version and remove this patch.
+# ** File: worker/patch_roberta.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.roberta.RobertaEmbedding.forward`
 #    Why:
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index 43a2e800..a361789f 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -23,7 +23,6 @@ if HAS_TRITON:
 # isort: off
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
-import vllm_ascend.patch.worker.patch_logits  # noqa
 import vllm_ascend.patch.worker.patch_roberta  # noqa
 import vllm_ascend.patch.worker.patch_weight_loader  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
diff --git a/vllm_ascend/patch/worker/patch_logits.py b/vllm_ascend/patch/worker/patch_logits.py
deleted file mode 100644
index 84a92f91..00000000
--- a/vllm_ascend/patch/worker/patch_logits.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import torch
-import vllm
-from vllm._custom_ops import apply_repetition_penalties_torch
-
-
-def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
-                               output_mask: torch.Tensor,
-                               repetition_penalties: torch.Tensor) -> None:
-    """Apply repetition penalties to logits in-place.
-
-    Args:
-        logits: The logits tensor of shape [num_seqs, vocab_size].
-        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
-        output_mask: A boolean tensor indicating which tokens appear in the output.
-        repetition_penalties: The repetition penalties of shape (num_seqs, ).
-    """
-    apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
-                                     repetition_penalties)
-
-
-# NPU device type tensors have attributes is_cuda=True and is_npu=True, according to its implementation in
-# https://github.com/Ascend/pytorch/blob/863b9071cbdf47023c12c246e3efa9c6e2285fc6/torch_npu/npu/_stream_check.py#L74
-# This causes that vLLM's apply_repetition_penalties function will run into the branch of "if logits.is_cuda" and
-# call the custom op implemented in CUDA, which is not compatible with NPU.
-# Reference: https://github.com/vllm-project/vllm/blob/f66673a39d9f364194c249f28098cad8a5584ccb/vllm/_custom_ops.py#L314
-vllm._custom_ops.apply_repetition_penalties = apply_repetition_penalties
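For reviewers who want to confirm the removal is safe on their setup, below is a minimal, hypothetical sanity check that is not part of this diff. It assumes an Ascend environment with `torch_npu` installed and a torch-npu build in which NPU tensors no longer report `is_cuda=True`, so vLLM's stock `vllm._custom_ops.apply_repetition_penalties` should take its non-CUDA fallback and still penalize masked tokens in place. Shapes, token indices, and the penalty value are illustrative.

```python
# Hypothetical sanity check: verify that the unpatched vLLM op works on NPU.
import torch
import torch_npu  # noqa: F401  # assumed available; registers the "npu" device
import vllm._custom_ops as ops

num_seqs, vocab_size = 2, 16
logits = torch.zeros(num_seqs, vocab_size, device="npu")
logits[0, 3] = 2.0  # a positive logit for the token we will penalize

prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool, device="npu")
output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool, device="npu")
output_mask[0, 3] = True  # pretend sequence 0 already generated token 3
repetition_penalties = torch.full((num_seqs,), 1.2, device="npu")

# With the monkey patch removed, this should dispatch to the torch fallback on NPU
# (logits.is_cuda is expected to be False) and modify logits in place.
ops.apply_repetition_penalties(logits, prompt_mask, output_mask, repetition_penalties)

assert logits[0, 3] < 2.0  # the penalized positive logit should shrink
assert torch.equal(logits[1], torch.zeros(vocab_size, device="npu"))  # untouched row
print("apply_repetition_penalties works on NPU without the monkey patch")
```

If a given torch-npu version still routes this call into the CUDA-only branch, the removed `patch_logits.py` shown above can be re-applied locally as a stopgap until the torch-npu fix lands.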