model: Minicpmo (#3023)

This commit is contained in:
Mick
2025-03-25 11:08:40 +08:00
committed by GitHub
parent 64129fa632
commit 1e86457c90
40 changed files with 2906 additions and 493 deletions

View File

@@ -144,11 +144,11 @@ class FINISH_ABORT(BaseFinishReason):
@dataclasses.dataclass
class ImageInputs:
class MultimodalInputs:
"""The image related inputs."""
pixel_values: Union[torch.Tensor, np.array]
image_hashes: Optional[list] = None
data_hashes: Optional[list] = None
image_sizes: Optional[list] = None
image_offsets: Optional[list] = None
image_pad_len: Optional[list] = None
@@ -182,20 +182,27 @@ class ImageInputs:
im_end_id: Optional[int] = None
slice_start_id: Optional[int] = None
slice_end_id: Optional[int] = None
# [num_images, 2 (w, h)]
tgt_sizes: Optional[list] = None
# audio
audio_start_id: Optional[torch.Tensor] = None
audio_end_id: Optional[torch.Tensor] = None
audio_features: Optional[List[torch.Tensor]] = None
audio_feature_lens: Optional[List[torch.Tensor]] = None
@staticmethod
def from_dict(obj: dict):
ret = ImageInputs(
ret = MultimodalInputs(
pixel_values=obj["pixel_values"],
image_hashes=obj["image_hashes"],
data_hashes=obj["data_hashes"],
)
# Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
# Please note that if the `input_ids` is later used in the model forward,
# you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
# errors in cuda kernels. See also llava.py for example.
ret.pad_values = [x % (1 << 30) for x in ret.image_hashes]
ret.pad_values = [x % (1 << 30) for x in ret.data_hashes]
optional_args = [
"image_sizes",
@@ -211,6 +218,10 @@ class ImageInputs:
"slice_start_id",
"slice_end_id",
"tgt_sizes",
"audio_start_id",
"audio_end_id",
"audio_features",
"audio_feature_lens",
]
for arg in optional_args:
if arg in obj:
@@ -223,9 +234,19 @@ class ImageInputs:
or isinstance(ret.pixel_values, list)
)
assert ret.audio_features is None or isinstance(ret.audio_features, list)
return ret
def merge(self, other: ImageInputs):
def contains_image_inputs(self) -> bool:
""" """
return self.pixel_values is not None and self.pixel_values != []
def contains_audio_inputs(self) -> bool:
""" """
return self.audio_features is not None and self.audio_features != []
def merge(self, other: MultimodalInputs):
"""
merge image inputs when requests are being merged
"""
@@ -268,10 +289,12 @@ class ImageInputs:
# Please note that if the `input_ids` is later used in the model forward,
# you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
# errors in cuda kernels. See also llava.py for example.
self.image_hashes += other.image_hashes
self.pad_values = [x % (1 << 30) for x in self.image_hashes]
self.data_hashes += other.data_hashes
self.pad_values = [x % (1 << 30) for x in self.data_hashes]
# args needed to be merged
optional_args = [
"audio_features",
"image_sizes",
"image_offsets",
"image_pad_len",
@@ -362,7 +385,7 @@ class Req:
self.decoded_text = ""
# For multimodal inputs
self.image_inputs: Optional[ImageInputs] = None
self.multimodal_inputs: Optional[MultimodalInputs] = None
# Prefix info
# The indices to kv cache for the shared prefix.
@@ -458,10 +481,10 @@ class Req:
return len(self.origin_input_ids) + len(self.output_ids)
def extend_image_inputs(self, image_inputs):
if self.image_inputs is None:
self.image_inputs = image_inputs
if self.multimodal_inputs is None:
self.multimodal_inputs = image_inputs
else:
self.image_inputs.merge(image_inputs)
self.multimodal_inputs.merge(image_inputs)
def finished(self) -> bool:
# Whether request reached finished condition
@@ -802,7 +825,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
self.encoder_cached = []
for req in self.reqs:
im = req.image_inputs
im = req.multimodal_inputs
if im is None or im.num_image_tokens is None:
# No image input
self.encoder_lens_cpu.append(0)
@@ -1391,7 +1414,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
extend_seq_lens=extend_seq_lens,
extend_prefix_lens=extend_prefix_lens,
extend_logprob_start_lens=extend_logprob_start_lens,
image_inputs=[r.image_inputs for r in self.reqs],
multimodal_inputs=[r.multimodal_inputs for r in self.reqs],
encoder_cached=self.encoder_cached,
encoder_lens=self.encoder_lens,
encoder_lens_cpu=self.encoder_lens_cpu,
@@ -1474,7 +1497,7 @@ class ModelWorkerBatch:
extend_input_logprob_token_ids: Optional[torch.Tensor]
# For multimodal
image_inputs: Optional[List[ImageInputs]]
multimodal_inputs: Optional[List[MultimodalInputs]]
# For encoder-decoder
encoder_cached: Optional[List[bool]]