[Model] Support pooling models (#3122)

### What this PR does / why we need it? Support pooling models (like `bge-reranker-v2-m3`) in vllm-ascend. This PR covers the three types of embed models (cls_token, mean_token, last_token). Since this [commit](17373dcd93), vLLM has supported adapting pooling models on the v1 engine; this PR adds the corresponding adaptations on the vllm-ascend side. Fixes #1960 - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: lianyibo <lianyibo1@kunlunit.com> Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: MengqingCao <cmq0113@163.com>
2025-12-10 11:37:57 +08:00
parent 1a7a34c5ec
commit e32014ac1d
17 changed files with 577 additions and 338 deletions
--- a/vllm_ascend/patch/init.py
+++ b/vllm_ascend/patch/init.py
@@ -106,16 +106,7 @@
 #
 # ** File: worker/patch_roberta.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#   1. `vllm.model_executor.models.roberta.RobertaEmbedding.forward`
-#    Why:
-#       shift operation in `_encode_token_type_ids` and `_decode_token_type_ids` cannot run in ascend aclgraph mode
-#    How：
-#       Replace shift operation with multiplication and division.
-#    Related PR (if no, explain why):
-#       No, this need CANN add an aclnn shift operation
-#    Future Plan:
-#       Revert this when CANN support shift aclnn operation
-#   2. `vllm.model_executor.models.roberta.RobertaForSequenceClassification.forward `
+#   1. `vllm.model_executor.models.bert`
 #    Why:
 #       shift operation in `_encode_token_type_ids` and `_decode_token_type_ids` cannot run in ascend aclgraph mode
 #    How：
--- a/vllm_ascend/patch/worker/init.py
+++ b/vllm_ascend/patch/worker/init.py
@@ -22,9 +22,9 @@ if HAS_TRITON:

 # isort: off
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
+import vllm_ascend.patch.worker.patch_bert  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_deepseek  # noqa
-import vllm_ascend.patch.worker.patch_roberta  # noqa
 import vllm_ascend.patch.worker.patch_weight_loader  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
 import vllm_ascend.patch.worker.patch_minicpm  # noqa
--- a/vllm_ascend/patch/worker/patch_bert.py
+++ b/vllm_ascend/patch/worker/patch_bert.py
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import torch
+from vllm.model_executor.models import bert
+
+# aclgraph does not support the shift operator yet; use mul / floor-div by 2**30.
+# TODO: revert this patch once aclgraph supports the shift operator.
+TOKEN_TYPE_SHIFT = 30
+TOKEN_TYPE_MULTIPLIER = 1 << 30
+TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1
+
+
+def _encode_token_type_ids(input_ids: torch.Tensor,
+                           token_type_ids: torch.Tensor) -> None:
+    # input_ids can be padded to the right
+    input_ids[:token_type_ids.shape[0]].bitwise_or_(token_type_ids *
+                                                    TOKEN_TYPE_MULTIPLIER)
+
+
+def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
+
+    token_type_ids = input_ids // TOKEN_TYPE_MULTIPLIER
+
+    input_ids.bitwise_and_(TOKEN_MASK)
+
+    return token_type_ids
+
+
+bert._encode_token_type_ids = _encode_token_type_ids
+bert._decode_token_type_ids = _decode_token_type_ids
--- a/vllm_ascend/patch/worker/patch_roberta.py
+++ b/vllm_ascend/patch/worker/patch_roberta.py
@@ -1,91 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from typing import Optional, Union
-
-import torch
-from vllm.model_executor.models.roberta import (
-    RobertaEmbedding, RobertaForSequenceClassification,
-    replace_roberta_positions)
-from vllm.sequence import IntermediateTensors
-
-# aclgraph does not support shift operator for now
-# TODO: revert me when aclgraph supports shift operator
-TOKEN_TYPE_SHIFT = 30
-TOKEN_TYPE_MULTIPLIER = 1 << 30
-TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1
-
-
-def _encode_token_type_ids(input_ids: torch.Tensor,
-                           token_type_ids: torch.Tensor) -> None:
-    # input_ids can be padded to the right
-    input_ids[:token_type_ids.shape[0]].bitwise_or_(token_type_ids *
-                                                    TOKEN_TYPE_MULTIPLIER)
-
-
-def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
-
-    token_type_ids = input_ids // TOKEN_TYPE_MULTIPLIER
-
-    input_ids.bitwise_and_(TOKEN_MASK)
-
-    return token_type_ids
-
-
-def roberta_for_sequence_classification_forward(
-    self,
-    input_ids: Optional[torch.Tensor],
-    positions: torch.Tensor,
-    intermediate_tensors: Optional[IntermediateTensors] = None,
-    inputs_embeds: Optional[torch.Tensor] = None,
-    token_type_ids: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    replace_roberta_positions(input_ids=input_ids,
-                              position_ids=positions,
-                              padding_idx=self.padding_idx)
-    if token_type_ids is not None:
-        assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
-        assert input_ids is not None
-        _encode_token_type_ids(input_ids, token_type_ids)
-    return self.roberta(input_ids=input_ids,
-                        positions=positions,
-                        inputs_embeds=inputs_embeds,
-                        intermediate_tensors=intermediate_tensors)
-
-
-def roberta_embedding_forward(
-    self,
-    input_ids: torch.Tensor,
-    position_ids: torch.Tensor,
-    inputs_embeds: Union[torch.Tensor, None] = None,
-) -> torch.Tensor:
-
-    token_type_ids = _decode_token_type_ids(input_ids)
-
-    if inputs_embeds is None:
-        inputs_embeds = self.word_embeddings(input_ids)
-
-    position_embeddings = self.position_embeddings(position_ids)
-
-    token_type_embeddings = self.token_type_embeddings(token_type_ids)
-    embeddings = inputs_embeds + token_type_embeddings + position_embeddings
-    embeddings = self.LayerNorm(embeddings)
-    return embeddings
-
-
-RobertaEmbedding.forward = roberta_embedding_forward
-RobertaForSequenceClassification.forward = roberta_for_sequence_classification_forward