[Model][VLM][Patch]Modify ascend affinity _merge_multimodal_embeddings (#3071)

### What this PR does / why we need it? This PR aims to address the incompatibility of the `.masked_scatter_` operation in the current `_merge_multimodal_embeddings` function on Ascend. For now, it reverts to the previous version of the CPU operation, which can be executed asynchronously on the device side to enhance performance. - vLLM version: v0.10.2 - vLLM main: f225ea7dd9 --------- Signed-off-by: booker123456 <945658361@qq.com>
2025-09-24 10:25:28 +08:00
parent b1380f3b87
commit c4b976af1a
3 changed files with 71 additions and 0 deletions
--- a/vllm_ascend/patch/init.py
+++ b/vllm_ascend/patch/init.py
@@ -56,6 +56,18 @@
 #    Future Plan:
 #       Find a better way to support tensor alignment for 310p without this patch.
 #
+# ** File: platform/patch_common/patch_multimodal_merge.py **
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. `vllm.model_executor.models.utils._merge_multimodal_embeddings`
+#    Why:
+#       '_merge_multimodal_embeddings' func of vllm is incompatible with Ascend.
+#    How:
+#       Replace with CPU operation that can be executed asynchronously.
+#    Related PR (if no, explain why):
+#       This is an Ascend-only bug. It can't be fixed in vLLM.
+#    Future Plan:
+#       Identify this pattern in torch-npu and remove this patch.
+#
 # * Worker Patch:
 # ===============
 # ** File: worker/patch_common/patch_minicpm.py **
--- a/vllm_ascend/patch/platform/patch_common/init.py
+++ b/vllm_ascend/patch/platform/patch_common/init.py
@@ -17,3 +17,4 @@

 import vllm_ascend.patch.platform.patch_common.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_common.patch_mamba_config  # noqa
+import vllm_ascend.patch.platform.patch_common.patch_multimodal_merge  # noqa
--- a/vllm_ascend/patch/platform/patch_common/patch_multimodal_merge.py
+++ b/vllm_ascend/patch/platform/patch_common/patch_multimodal_merge.py
@@ -0,0 +1,58 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+
+import torch
+import vllm
+from vllm.model_executor.models.utils import (_embedding_count_expression,
+                                              _flatten_embeddings)
+from vllm.multimodal import NestedTensors
+
+
+def _merge_multimodal_embeddings(
+    inputs_embeds: torch.Tensor,
+    is_multimodal: torch.Tensor,
+    multimodal_embeddings: NestedTensors,
+) -> torch.Tensor:
+    """
+    Overwrite the positions of ``inputs_embeds`` selected by ``is_multimodal``
+    with the flattened ``multimodal_embeddings`` and return ``inputs_embeds``.
+    A ``RuntimeError`` from the assignment is re-raised as ``ValueError``.
+
+    Note:
+        This updates ``inputs_embeds`` in place.
+    """
+    flattened = _flatten_embeddings(multimodal_embeddings)
+    try:
+        inputs_embeds[is_multimodal] = flattened
+    except RuntimeError as e:
+        num_expected_tokens = is_multimodal.sum().item()
+        assert isinstance(num_expected_tokens, int)
+        # Distinguish a token-count mismatch from any other failure.
+        if flattened.shape[0] != num_expected_tokens:
+            expr = _embedding_count_expression(multimodal_embeddings)
+            raise ValueError(
+                f"Attempted to assign {expr} = {flattened.shape[0]} "
+                f"multimodal tokens to {num_expected_tokens} placeholders"
+            ) from e
+        else:
+            raise ValueError("Error during masked scatter operation") from e
+
+    return inputs_embeds
+
+# Import-time side effect: rebind vLLM's helper to the implementation above.
+vllm.model_executor.models.utils._merge_multimodal_embeddings = _merge_multimodal_embeddings