enginex-biren-vllm/vllm_br/model_executor/models/roberta.py

################################################################################
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################

# SPDX-License-Identifier: Apache-2.0

from typing import Optional

import torch
# Adapted from transformers
from fastcore.basics import patch_to

import vllm
from vllm.model_executor.models.roberta import (
    create_position_ids_from_input_ids)


@patch_to(vllm.model_executor.models.roberta.RobertaClassificationHead)
def forward(self, features, **kwargs):
    """Run the classification head on the first row of ``features``.

    Installed on ``RobertaClassificationHead`` via ``patch_to``.

    Args:
        features: Tensor indexed as ``features[0, :]``, so it must have at
            least two dimensions. Presumably (num_tokens, hidden_size) --
            TODO confirm against the caller.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The result of ``dense -> tanh -> out_proj`` on the selected row,
        with the temporary leading dimension squeezed away again.
    """
    # Row 0 is the <s> token (equiv. to [CLS]); give it a leading batch
    # dimension before it goes through the projection layers.
    cls_row = features[0, :].unsqueeze(0)
    activated = torch.tanh(self.dense(cls_row))
    logits = self.out_proj(activated)
    # Drop the batch dimension that was added above.
    return logits.squeeze(0)


@patch_to(vllm.model_executor.models.roberta.RobertaEmbedding)
def forward(
    self,
    input_ids: torch.Tensor,
    seq_lens: torch.Tensor,
    position_ids: torch.Tensor,
    token_type_ids: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Compute RoBERTa input embeddings for a packed batch of sequences.

    Installed on ``RobertaEmbedding`` via ``patch_to``. The incoming
    ``position_ids`` are only validated; the positions actually embedded
    are rebuilt per sequence with ``create_position_ids_from_input_ids``.

    Args:
        input_ids: Token ids; ``squeeze(0)`` is applied before use.
            Presumably shaped (1, total_tokens) -- TODO confirm.
        seq_lens: Length of each sequence, in the order the sequences are
            laid out back-to-back along ``input_ids`` / ``position_ids``.
        position_ids: Incoming positions, sliced along dim 0 per sequence.
            Each slice must equal ``arange(len(slice))``.
        token_type_ids: Optional token type ids. When ``None``, zeros with
            the shape of the squeezed ``input_ids`` are used.

    Returns:
        Layer-normalised sum of word, token-type and position embeddings,
        with a leading dimension added by ``unsqueeze(0)``.

    Raises:
        AssertionError: If a per-sequence slice of ``position_ids`` is not
            the sequence 0..N-1.
    """
    input_ids = input_ids.squeeze(0)  # notice here input_ids is 2-dim tensor
    inputs_embeds = self.word_embeddings(input_ids)
    device = inputs_embeds.device

    # Replace position ids because in RoBERTa models
    # they have to start at padding_idx + 1 and ignore
    # existing padding tokens
    # References:
    # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
    # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
    rebuilt_positions = []
    start = 0
    for seq_len in seq_lens:
        end = start + seq_len
        seq_positions = position_ids[start:end]
        seq_tokens = input_ids[start:end]
        start = end

        # Verify assumption that incoming position are
        # always a sequence from 0 to N.
        expected = torch.arange(seq_positions.size()[0],
                                dtype=torch.long,
                                device=device)
        assert torch.equal(seq_positions, expected)

        rebuilt_positions.append(
            create_position_ids_from_input_ids(seq_tokens, self.padding_idx))
    position_ids = torch.cat(rebuilt_positions)

    # Position embeddings.
    position_embeddings = self.position_embeddings(position_ids)

    # Fall back to all-zero token types matching the squeezed input shape.
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_ids.size(),
                                     dtype=torch.long,
                                     device=device)
    token_type_embeddings = self.token_type_embeddings(token_type_ids)

    summed = inputs_embeds + token_type_embeddings + position_embeddings
    normalized = self.LayerNorm(summed)
    return normalized.unsqueeze(0)  # add batch dimension for BR attention