################################################################################
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
from typing import Optional

import torch
from fastcore.basics import patch_to
from vllm.model_executor.models.bert import BertModel
from vllm.sequence import IntermediateTensors


@patch_to(BertModel)
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Run the BERT embeddings and encoder with a leading batch axis of 1.

    This function is attached to ``BertModel`` as its ``forward`` method via
    ``patch_to``.

    Args:
        input_ids: Token ids. A leading axis is added before they are passed
            to the embedding layer. Ignored when ``inputs_embeds`` is given.
        positions: Position ids, forwarded to the embedding layer as
            ``position_ids``.
        intermediate_tensors: Accepted for signature compatibility; not read
            by this implementation.
        inputs_embeds: Optional precomputed embeddings. When provided they
            are fed to the encoder as-is and the embedding layer is skipped.

    Returns:
        The encoder output after ``squeeze(0)``.
    """
    if inputs_embeds is None:
        # The batch size is pinned to 1 here; without the extra leading axis
        # the attention module raises an error.
        batched_ids = input_ids.unsqueeze(0)
        hidden_states = self.embeddings(
            input_ids=batched_ids, position_ids=positions
        )
    else:
        # NOTE(review): no batch axis is added on this path, yet the result
        # below is still squeezed on axis 0 -- confirm callers pass embeddings
        # that already carry the leading axis.
        hidden_states = inputs_embeds

    encoded = self.encoder(hidden_states)
    # Drop the leading axis again (no-op when that axis is not of size 1).
    return encoded.squeeze(0)