From ded24f50264a889c808971636e7a7d4206ae5563 Mon Sep 17 00:00:00 2001
From: Joeegin <80816296+Joeegin@users.noreply.github.com>
Date: Sun, 4 Jan 2026 16:38:05 +0800
Subject: [PATCH] [Model] Supporet  InternVL2_5 on v0.11.0 (#72)

Co-authored-by: v_qiaoyijin <v_qiaoyijin@baidu.com>
---
 vllm_kunlun/models/intern_vit.py | 35 ++++++++++++---
 vllm_kunlun/models/internlm2.py  | 32 +++++++-------
 vllm_kunlun/models/internvl.py   | 76 ++++++++++++++++++++++----------
 3 files changed, 97 insertions(+), 46 deletions(-)

diff --git a/vllm_kunlun/models/intern_vit.py b/vllm_kunlun/models/intern_vit.py
index e04f284..19604cd 100644
--- a/vllm_kunlun/models/intern_vit.py
+++ b/vllm_kunlun/models/intern_vit.py
@@ -29,6 +29,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from vllm.model_executor.models.vision import run_dp_sharded_vision_model
+
 NORM2FN = {
     'rms_norm': RMSNorm,
     'layer_norm': nn.LayerNorm,
@@ -137,6 +139,7 @@ class InternParallelAttention(nn.Module):
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
@@ -166,6 +169,7 @@ class InternParallelAttention(nn.Module):
             bias=config.qkv_bias,
             quant_config=quant_config,
             prefix=f"{prefix}.qkv",
+            disable_tp=use_data_parallel,
         )
 
         self.qk_normalization = config.qk_normalization
@@ -183,6 +187,7 @@ class InternParallelAttention(nn.Module):
             self.embed_dim,
             quant_config=quant_config,
             prefix=f"{prefix}.proj",
+            disable_tp=use_data_parallel,
         )
 
         self.attn = MultiHeadAttention(self.num_heads_per_partition,
@@ -286,6 +291,7 @@ class InternMLP(nn.Module):
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
@@ -295,12 +301,14 @@ class InternMLP(nn.Module):
                                         config.intermediate_size,
                                         bias=True,
                                         quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1")
+                                        prefix=f"{prefix}.fc1",
+                                        disable_tp=use_data_parallel)
         self.fc2 = RowParallelLinear(config.intermediate_size,
                                      config.hidden_size,
                                      bias=True,
                                      quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2")
+                                     prefix=f"{prefix}.fc2",
+                                     disable_tp=use_data_parallel)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc1(hidden_states)
@@ -319,6 +327,7 @@ class InternVisionEncoderLayer(nn.Module):
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
@@ -329,11 +338,13 @@ class InternVisionEncoderLayer(nn.Module):
         self.attn = self._init_attn(config,
                                     quant_config,
                                     num_dummy_heads=num_dummy_heads,
-                                    prefix=f"{prefix}.attn")
+                                    prefix=f"{prefix}.attn",
+                                    use_data_parallel=use_data_parallel)
 
         self.mlp = InternMLP(config,
                              quant_config=quant_config,
-                             prefix=f"{prefix}.mlp")
+                             prefix=f"{prefix}.mlp",
+                             use_data_parallel=use_data_parallel)
         self.norm1 = NORM2FN[self.norm_type](self.embed_dim,
                                              eps=config.layer_norm_eps)
         self.norm2 = NORM2FN[self.norm_type](self.embed_dim,
@@ -351,6 +362,7 @@ class InternVisionEncoderLayer(nn.Module):
         *,
         num_dummy_heads: int,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         # fallback to sdpa attention if tp unavailable
         tp_size = get_tensor_model_parallel_world_size()
@@ -387,6 +399,7 @@ class InternVisionEncoder(nn.Module):
         num_hidden_layers_override: Optional[int] = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
 
@@ -401,7 +414,8 @@ class InternVisionEncoder(nn.Module):
             InternVisionEncoderLayer(config,
                                      quant_config,
                                      num_dummy_heads=num_dummy_heads,
-                                     prefix=f"{prefix}.layers.{layer_idx}")
+                                     prefix=f"{prefix}.layers.{layer_idx}",
+                                     use_data_parallel=use_data_parallel)
             for layer_idx in range(num_hidden_layers)
         ])
 
@@ -428,10 +442,12 @@ class InternVisionModel(nn.Module):
         num_hidden_layers_override: Optional[int] = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
         self.config = config
+        self.use_data_parallel = use_data_parallel
 
         self.embeddings = InternVisionEmbeddings(config)
         self.encoder = InternVisionEncoder(
@@ -440,6 +456,7 @@ class InternVisionModel(nn.Module):
             num_hidden_layers_override=num_hidden_layers_override,
             num_dummy_heads=num_dummy_heads,
             prefix=f"{prefix}.encoder",
+            use_data_parallel=use_data_parallel,
         )
 
     def get_input_embeddings(self):
@@ -463,7 +480,11 @@ class InternVisionModel(nn.Module):
                 raise ValueError(
                     f'wrong pixel_values size: {pixel_values.shape}')
 
-        encoder_outputs = self.encoder(inputs_embeds=hidden_states)
+        if self.use_data_parallel:
+            encoder_outputs = run_dp_sharded_vision_model(
+                hidden_states, self.encoder)
+        else:
+            encoder_outputs = self.encoder(inputs_embeds=hidden_states)
 
         return encoder_outputs
 
@@ -477,4 +498,4 @@ class InternVisionModel(nn.Module):
                                     default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
-        return loaded_params
+        return loaded_params
\ No newline at end of file
diff --git a/vllm_kunlun/models/internlm2.py b/vllm_kunlun/models/internlm2.py
index 7f1c241..13019ac 100644
--- a/vllm_kunlun/models/internlm2.py
+++ b/vllm_kunlun/models/internlm2.py
@@ -3,6 +3,7 @@
 
 from collections.abc import Iterable
 from functools import partial
+from itertools import islice
 from typing import Any, Optional, Union
 
 import torch
@@ -30,10 +31,10 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP, default_pooling_type
+from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
+from vllm.model_executor.models.interfaces_base import default_pooling_type
 from vllm.model_executor.models.utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -298,7 +299,7 @@ class InternLM2Model(nn.Module):
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
-        for layer in self.layers[self.start_layer:self.end_layer]:
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(positions, hidden_states, residual)
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({
@@ -358,10 +359,8 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.output, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.output, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
@@ -423,13 +422,15 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
             delattr(self, attr)
 
         config = vllm_config.model_config.hf_config
-        self.v_head = RowParallelLinear(
-            config.hidden_size,
-            1,
-            bias=False,
-            input_is_parallel=False,
-            prefix=maybe_prefix(prefix, "v_head"),
-        )
+        self.head_dtype = vllm_config.model_config.head_dtype
+
+        self.v_head = RowParallelLinear(config.hidden_size,
+                                        1,
+                                        bias=False,
+                                        input_is_parallel=False,
+                                        params_dtype=self.head_dtype,
+                                        prefix=maybe_prefix(prefix, "v_head"),
+                                        return_bias=False)
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
@@ -446,5 +447,6 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
     ) -> Union[torch.Tensor, IntermediateTensors]:
         hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                    inputs_embeds)
-        logits, _ = self.v_head(hidden_states)
-        return logits
+        hidden_states = hidden_states.to(self.head_dtype)
+        logits = self.v_head(hidden_states)
+        return logits
\ No newline at end of file
diff --git a/vllm_kunlun/models/internvl.py b/vllm_kunlun/models/internvl.py
index 6ec825c..de5eece 100644
--- a/vllm_kunlun/models/internvl.py
+++ b/vllm_kunlun/models/internvl.py
@@ -7,6 +7,7 @@
 # Copyright (c) 2023 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
+import os
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Annotated, Any, Literal, Optional, TypeVar, Union
@@ -21,13 +22,13 @@ from transformers import BatchEncoding, PretrainedConfig, TensorType
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
-from .intern_vit import (InternVisionModel, InternVisionPatchModel)
+from vllm.model_executor.models.intern_vit import (InternVisionModel,
+                                                   InternVisionPatchModel)
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -36,6 +37,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import set_default_torch_num_threads
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from vllm.model_executor.models.interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -114,13 +116,26 @@ InternVLVideoInputs = Union[InternVLVideoPixelInputs,
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
 def build_transform(input_size: int):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    return T.Compose([
+    transform = T.Compose([
         T.Lambda(lambda img: convert_image_mode(img, 'RGB')),
         T.Resize((input_size, input_size),
                  interpolation=T.InterpolationMode.BICUBIC),
         T.ToTensor(),
         T.Normalize(mean=MEAN, std=STD)
     ])
+    # Image transformation operations (which include tensor computations
+    # on the CPU) can occupy a substantial number of CPU cores, introducing
+    # overhead due to CPU contention. This issue becomes particularly
+    # noticeable when deploying multiple vLLM instances on a single machine.
+    # Therefore, it is necessary to limit the number of threads allocated to
+    # image transformation tasks.
+    num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
+
+    def apply(img):
+        with set_default_torch_num_threads(num_threads):
+            return transform(img)
+
+    return apply
 
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
@@ -796,18 +811,19 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
 
-        if "image_num_patches" in out_mm_kwargs:
-            image_num_patches = out_mm_kwargs["image_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "image_num_patches" in out_mm_data:
+            image_num_patches = out_mm_data["image_num_patches"]
             assert isinstance(image_num_patches, torch.Tensor)
             image_num_patches = image_num_patches.tolist()
-        elif "image_embeds" in out_mm_kwargs:
+        elif "image_embeds" in out_mm_data:
             # TODO: Use image size information in dictionary embedding inputs
             # to compute num_patches (similar to Qwen2-VL)
-            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+            image_num_patches = [None] * len(out_mm_data["image_embeds"])
         else:
             image_num_patches = []
 
@@ -853,9 +869,13 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
 
     def get_video_token(self) -> Optional[str]:
         text_model_type = self.get_hf_config().get_text_config().model_type
-        if text_model_type == "qwen2":
-            return "<|video_pad|>"
-        return None
+        video_token_map = {
+            "qwen2": "<|video_pad|>",
+            "qwen3": "<|video_pad|>",
+            "qwen3_moe": "<|video_pad|>",
+            "gpt_oss": "<|reserved_200000|>",
+        }
+        return video_token_map.get(text_model_type)
 
     def get_num_frames_with_most_features(
         self,
@@ -965,15 +985,19 @@ class InternVLMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
-        prompt_repl: list[PromptUpdate] = super()._get_prompt_updates(
-            mm_items, hf_processor_mm_kwargs, out_mm_kwargs)
+        prompt_repl = super()._get_prompt_updates(
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            out_mm_kwargs=out_mm_kwargs,
+        )
 
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
 
-        if "video_num_patches" in out_mm_kwargs:
-            video_num_patches = out_mm_kwargs["video_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "video_num_patches" in out_mm_data:
+            video_num_patches = out_mm_data["video_num_patches"]
             assert isinstance(video_num_patches, torch.Tensor)
             video_num_patches = video_num_patches.tolist()
         else:
@@ -991,12 +1015,15 @@ class InternVLMultiModalProcessor(
                 video_context_token=hf_processor.video_token)
 
         if self.info.supports_video:
-            prompt_repl.append(
+            prompt_repl = [
+                *prompt_repl,
                 PromptReplacement(
                     modality="video",
                     target="<video>",
                     replacement=get_video_replacement_internvl,
-                ))
+                )
+            ]
+
         return prompt_repl
 
 
@@ -1007,6 +1034,8 @@ class InternVLMultiModalProcessor(
 class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
                         SupportsLoRA):
 
+    supports_encoder_tp_data = True
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
         if modality.startswith("image"):
@@ -1025,6 +1054,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
 
         self.config = config
         self.multimodal_config = multimodal_config
+        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
         self._patch_quant_config(config, quant_config)
 
         image_size = config.force_image_size or config.vision_config.image_size
@@ -1092,7 +1122,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
                 quant_config=quant_config,
                 num_hidden_layers_override=num_hidden_layers,
                 prefix=prefix,
-            )
+                use_data_parallel=self.use_data_parallel)
         else:
             return InternVisionPatchModel(config.vision_config)
 
@@ -1368,10 +1398,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
@@ -1392,4 +1420,4 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
         return MultiModelKeys.from_string_field(
             language_model="language_model",
             connector="mlp1",
-            tower_model="vision_model")
+            tower_model="vision_model")
\ No newline at end of file