Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -300,14 +300,26 @@ class Base(
            for child_name, child_module in module.named_children():
                new_module = child_module
                qual_name = maybe_prefix(prefix, child_name)
-                # Populate Eagle3 attrs
                if (
                    isinstance(module, nn.ModuleList)
                    and len(module) == self.text_config.num_hidden_layers
                ):
+                    # Populate Eagle3 attrs
                    self._target_class = type(child_module)
                    layer_name = qual_name.removeprefix("model.")
                    self._layer_names[int(child_name)] = layer_name
+                    # MTP weights should not be loaded into the base model
+                    num_hidden_layers = self.text_config.num_hidden_layers
+                    names = (
+                        "n_predict",  # Override from SpeculativeConfig
+                        "num_nextn_predict_layers",  # Most models
+                        "mtp_num_hidden_layers",  # Qwen 3.5
+                    )
+                    n_predict = getattr_iter(self.text_config, names, 0)
+                    for i in range(num_hidden_layers, num_hidden_layers + n_predict):
+                        mtp_prefix = f"{prefix}.{i}."
+                        if mtp_prefix not in self.ignore_unexpected_prefixes:
+                            self.ignore_unexpected_prefixes.append(mtp_prefix)
                # Replace modules as needed
                if isinstance(child_module, nn.Linear):
                    generator = (p for p in tp_plan if re.match(p, qual_name))
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -218,7 +218,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
            if "mm_token_type_ids" in processed_data
            else "token_type_ids"
        )
-        mm_token_type_ids = processed_data.pop(token_type_key)
+        mm_token_type_ids = processed_data.get(token_type_key)

        # We can infer vLLM style placeholder from token type ids, if we split
        # it for each input `mm_data`.
@@ -353,6 +353,7 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):

        num_image_patches = kwargs.pop("num_image_patches")
        kwargs.pop("token_type_ids", None)  # used only in `forward`
+        kwargs.pop("mm_token_type_ids", None)  # used only in `model.get_rope_index`

        if pixel_values is not None:
            # ROCm: Force math SDP backend for vision encoder to avoid accuracy issues
@@ -443,6 +444,7 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
            {
                "image_grid_thw",
                "video_grid_thw",
+                "mm_token_type_ids",
                "second_per_grid_ts",
                "audio_feature_lengths",
                "use_audio_in_video",
@@ -451,7 +453,7 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
        if any(
            v
            for k, v in kwargs.items()
-            if k not in {"image_grid_thw", "video_grid_thw"}
+            if k not in {"image_grid_thw", "mm_token_type_ids"}
        ):
            raise NotImplementedError(
                "Transformers modeling backend only supports images."
@@ -459,6 +461,7 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):

        image_grid_thw = kwargs.get("image_grid_thw", [])
        video_grid_thw = kwargs.get("video_grid_thw", [])
+        mm_token_type_ids = kwargs.get("mm_token_type_ids")

        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
            image_grid_thw
@@ -467,10 +470,29 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
            video_grid_thw
        )

+        # In v4, `get_rope_index` has no wildcard `**kwargs`, so it can't
+        # accept arbitrary arguments, even ones whose value is `None`
+        kwargs = {}
+        if mm_token_type_ids:
+            if not hasattr(self, "_get_rope_index_accepts_mm_token_type_ids"):
+                import inspect
+
+                sig = inspect.signature(self.model.get_rope_index)
+                params = sig.parameters
+                self._get_rope_index_accepts_mm_token_type_ids = (
+                    "mm_token_type_ids" in params
+                    or any(
+                        p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()
+                    )
+                )
+            if self._get_rope_index_accepts_mm_token_type_ids:
+                kwargs["mm_token_type_ids"] = torch.cat(mm_token_type_ids)
+
        mrope_positions, mrope_position_delta = self.model.get_rope_index(
            input_ids=torch.tensor(input_tokens).unsqueeze(0),
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
+            **kwargs,
        )

        mrope_positions = mrope_positions[:, 0]