[Model] Support pooling models (#3122)
### What this PR does / why we need it?
Support pooling models (like `bge-reranker-v2-m3`) in vllm-ascend. This PR covers the three embedding pooling types (cls_token, mean_token, last_token). After this [commit](17373dcd93), vLLM provides support for adapting pooling models on the v1 engine; this PR adds the corresponding adaptations on the vllm-ascend side.

Fixes #1960

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------
Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
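For context, a minimal sketch of exercising such a reranker (pooling) model through vLLM's offline scoring API. The `task="score"` argument and the output layout are assumptions based on vLLM's documented scoring interface and may differ across versions; they are not part of this PR:

```python
# Hedged sketch: run a reranker (pooling) model offline with vLLM.
# task="score" and output.outputs.score are assumptions that may
# differ across vLLM versions.
from vllm import LLM

llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
outputs = llm.score(
    "What is the capital of France?",
    ["Paris is the capital of France.",
     "The giant panda is a bear native to China."])
for output in outputs:
    print(output.outputs.score)
```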
@@ -106,16 +106,7 @@
#
# ** File: worker/patch_roberta.py **
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.model_executor.models.roberta.RobertaEmbedding.forward`
#    Why:
#       The shift operation in `_encode_token_type_ids` and `_decode_token_type_ids` cannot run in ascend aclgraph mode.
#    How:
#       Replace the shift operation with multiplication and division.
#    Related PR (if no, explain why):
#       No, this needs CANN to add an aclnn shift operation.
#    Future Plan:
#       Revert this when CANN supports the aclnn shift operation.
# 2. `vllm.model_executor.models.roberta.RobertaForSequenceClassification.forward`
# 1. `vllm.model_executor.models.bert`
#    Why:
#       The shift operation in `_encode_token_type_ids` and `_decode_token_type_ids` cannot run in ascend aclgraph mode.
#    How:
#       Replace the shift operation with multiplication and division.
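Since the rewrite hinges on multiplication and integer division being bit-exact replacements for shifts on non-negative integers, here is a small self-contained check (illustrative only, not part of the diff):

```python
# Demonstrates that the multiply/divide rewrite matches shift-based
# packing/unpacking for non-negative token ids (illustrative only).
import torch

TOKEN_TYPE_SHIFT = 30
TOKEN_TYPE_MULTIPLIER = 1 << TOKEN_TYPE_SHIFT

ids = torch.tensor([101, 2023, 102])
types = torch.tensor([0, 1, 1])

packed_shift = ids | (types << TOKEN_TYPE_SHIFT)
packed_mul = ids | (types * TOKEN_TYPE_MULTIPLIER)
assert torch.equal(packed_shift, packed_mul)

# unpacking: integer division stands in for the right shift
assert torch.equal(packed_mul >> TOKEN_TYPE_SHIFT,
                   packed_mul // TOKEN_TYPE_MULTIPLIER)
```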
@@ -22,9 +22,9 @@ if HAS_TRITON:
# isort: off
import vllm_ascend.patch.platform.patch_sched_yield  # noqa
import vllm_ascend.patch.worker.patch_bert  # noqa
import vllm_ascend.patch.worker.patch_distributed  # noqa
import vllm_ascend.patch.worker.patch_deepseek  # noqa
import vllm_ascend.patch.worker.patch_roberta  # noqa
import vllm_ascend.patch.worker.patch_weight_loader  # noqa
import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
import vllm_ascend.patch.worker.patch_minicpm  # noqa
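These imports work purely by side effect: importing each `patch_*` module rebinds symbols inside vLLM. A minimal self-contained sketch of that mechanism (all names invented for illustration):

```python
# Self-contained sketch of import-time monkey patching (invented names);
# the real patch modules rebind attributes on vLLM modules the same way.
import types

target = types.ModuleType("target")  # stands in for vllm...models.bert
target.helper = lambda: "original behavior"


def patched_helper():
    return "ascend-friendly behavior"


# this assignment is what executes when a patch module is imported
target.helper = patched_helper

assert target.helper() == "ascend-friendly behavior"
```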
vllm_ascend/patch/worker/patch_bert.py (new file, 45 lines)
@@ -0,0 +1,45 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import torch

from vllm.model_executor.models import bert

# aclgraph does not support the shift operator for now
# TODO: revert me when aclgraph supports the shift operator
TOKEN_TYPE_SHIFT = 30
TOKEN_TYPE_MULTIPLIER = 1 << 30
TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1


def _encode_token_type_ids(input_ids: torch.Tensor,
                           token_type_ids: torch.Tensor) -> None:
    # input_ids can be padded to the right, so only the leading
    # token_type_ids.shape[0] entries carry token types
    input_ids[:token_type_ids.shape[0]].bitwise_or_(token_type_ids *
                                                    TOKEN_TYPE_MULTIPLIER)


def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
    # integer division recovers the high bits (aclgraph-safe stand-in
    # for a right shift by TOKEN_TYPE_SHIFT)
    token_type_ids = input_ids // TOKEN_TYPE_MULTIPLIER
    # clear the high bits in place to restore the original token ids
    input_ids.bitwise_and_(TOKEN_MASK)
    return token_type_ids


bert._encode_token_type_ids = _encode_token_type_ids
bert._decode_token_type_ids = _decode_token_type_ids
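A hypothetical round trip through these helpers (values invented for illustration): token types ride in the high bits of `input_ids` and are stripped back out on decode:

```python
# Round-trip check for the pack/unpack helpers above (values invented).
import torch

input_ids = torch.tensor([101, 2023, 102, 0])  # padded to the right
token_type_ids = torch.tensor([0, 1, 1])

_encode_token_type_ids(input_ids, token_type_ids)  # packs types in place
decoded = _decode_token_type_ids(input_ids)        # also restores input_ids

assert torch.equal(decoded[:3], token_type_ids)
assert torch.equal(input_ids, torch.tensor([101, 2023, 102, 0]))
```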
vllm_ascend/patch/worker/patch_roberta.py (deleted file, 91 lines)
@@ -1,91 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Optional, Union

import torch
from vllm.model_executor.models.roberta import (
    RobertaEmbedding, RobertaForSequenceClassification,
    replace_roberta_positions)
from vllm.sequence import IntermediateTensors

# aclgraph does not support the shift operator for now
# TODO: revert me when aclgraph supports the shift operator
TOKEN_TYPE_SHIFT = 30
TOKEN_TYPE_MULTIPLIER = 1 << 30
TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1


def _encode_token_type_ids(input_ids: torch.Tensor,
                           token_type_ids: torch.Tensor) -> None:
    # input_ids can be padded to the right
    input_ids[:token_type_ids.shape[0]].bitwise_or_(token_type_ids *
                                                    TOKEN_TYPE_MULTIPLIER)


def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
    token_type_ids = input_ids // TOKEN_TYPE_MULTIPLIER
    input_ids.bitwise_and_(TOKEN_MASK)
    return token_type_ids


def roberta_for_sequence_classification_forward(
    self,
    input_ids: Optional[torch.Tensor],
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    replace_roberta_positions(input_ids=input_ids,
                              position_ids=positions,
                              padding_idx=self.padding_idx)
    if token_type_ids is not None:
        assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
        assert input_ids is not None
        _encode_token_type_ids(input_ids, token_type_ids)
    return self.roberta(input_ids=input_ids,
                        positions=positions,
                        inputs_embeds=inputs_embeds,
                        intermediate_tensors=intermediate_tensors)


def roberta_embedding_forward(
    self,
    input_ids: torch.Tensor,
    position_ids: torch.Tensor,
    inputs_embeds: Union[torch.Tensor, None] = None,
) -> torch.Tensor:
    token_type_ids = _decode_token_type_ids(input_ids)
    if inputs_embeds is None:
        inputs_embeds = self.word_embeddings(input_ids)
    position_embeddings = self.position_embeddings(position_ids)
    token_type_embeddings = self.token_type_embeddings(token_type_ids)
    embeddings = inputs_embeds + token_type_embeddings + position_embeddings
    embeddings = self.LayerNorm(embeddings)
    return embeddings


RobertaEmbedding.forward = roberta_embedding_forward
RobertaForSequenceClassification.forward = roberta_for_sequence_classification_forward