Llama3.2 vision model support (#1551)

This commit is contained in:
Liangsheng Yin
2024-10-21 15:01:21 -07:00
committed by GitHub
parent 00611286a1
commit 94cde10920
21 changed files with 1562 additions and 122 deletions

File diff suppressed because it is too large. (Load Diff)

View File

@@ -605,7 +605,11 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
]
positions = forward_batch.mrope_positions
-if image_inputs is None or len(image_inputs) == 0:
+if (
+forward_batch.forward_mode.is_decode()
+or image_inputs is None
+or len(image_inputs) == 0
+):
inputs_embeds = self.model.embed_tokens(input_ids)
else:
if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":